intel/fs: Emit HALT for discard on Gen4-5
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 struct brw_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct gen_device_info *devinfo = p->devinfo;
51 if (devinfo->gen < 6)
52 return;
53
54 if (src->file == BRW_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58 assert(devinfo->gen < 12);
59 brw_push_insn_state(p);
60 brw_set_default_exec_size(p, BRW_EXECUTE_8);
61 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64 retype(*src, BRW_REGISTER_TYPE_UD));
65 brw_pop_insn_state(p);
66 }
67 *src = brw_message_reg(msg_reg_nr);
68 }
69
70 static void
71 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72 {
73 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74 * "The send with EOT should use register space R112-R127 for <src>. This is
75 * to enable loading of a new thread into the same slot while the message
76 * with EOT for current thread is pending dispatch."
77 *
78 * Since we're pretending to have 16 MRFs anyway, we may as well use the
79 * registers required for messages with EOT.
80 */
81 const struct gen_device_info *devinfo = p->devinfo;
82 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83 reg->file = BRW_GENERAL_REGISTER_FILE;
84 reg->nr += GEN7_MRF_HACK_START;
85 }
86 }
87
/**
 * Encode the destination operand \p dest into \p inst.
 *
 * Handles the ordinary ALU destination encoding as well as the special
 * compact encodings used by Gen12+ SEND/SENDC and pre-Gen12 split-send
 * (SENDS/SENDSC).  When p->automatic_exec_sizes is set, may also shrink
 * the instruction's exec size to match a small destination width.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the register file's limit. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where a destination of size Byte with
    * a stride of 1 is only allowed for a packed byte MOV. For any other
    * instruction, the stride must be at least 2, even when the destination
    * is the NULL register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1 &&
       dest.hstride == BRW_HORIZONTAL_STRIDE_1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   gen7_convert_mrf_to_grf(p, &dest);

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gen12+ SEND(C): only a plain, unmodified, register-aligned GRF/ARF
       * destination is encodable.
       */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gen12 split-send: subregister is encoded in 16-byte units. */
      assert(devinfo->gen < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      /* Ordinary ALU destination encoding. */
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            /* A stride of 0 is not encodable for a destination; use 1. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             * Although Dst.HorzStride is a don't care for Align16, HW needs
             * this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            /* A stride of 0 is not encodable for a destination; use 1. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct. However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
207
/**
 * Encode source operand 0 (\p reg) into \p inst.
 *
 * Like brw_set_dest(), this dispatches between the ordinary ALU encoding,
 * the Gen12+ SEND/SENDC encoding, and the pre-Gen12 split-send encoding.
 * Immediates are stored here as well (a sub-8-byte immediate also mirrors
 * its type into the src1 slot on pre-Gen12 hardware).
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the register file's limit. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gen12+ SEND(C): register-aligned, unmodified direct source only. */
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gen12 split-send: subregister is encoded in 16-byte units. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             reg.vstride == reg.width + 1);
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      /* Ordinary ALU source encoding. */
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         /* For sub-8-byte immediates on pre-Gen12, the src1 slot mirrors
          * the immediate's hardware type.
          */
         if (devinfo->gen < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               /* Align16 subregisters are encoded in 16-byte units. */
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A scalar region (width 1, exec size 1) is encoded as <0;1,0>. */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                * are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
341
342
/**
 * Encode source operand 1 (\p reg) into \p inst.
 *
 * For split-send (SENDS/SENDSC) and Gen12+ SEND/SENDC the operand uses the
 * compact send-src1 encoding; otherwise it uses the ordinary ALU encoding.
 * Immediates here must fit in 32 bits.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->gen >= 12 &&
        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
      /* Send-style src1: register-aligned, unmodified direct GRF/ARF only. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *    operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gen7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            /* Align16 subregisters are encoded in 16-byte units. */
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A scalar region (width 1, exec size 1) is encoded as <0;1,0>. */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                * are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
448
449 /**
450 * Specify the descriptor and extended descriptor immediate for a SEND(C)
451 * message instruction.
452 */
453 void
454 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
455 unsigned desc, unsigned ex_desc)
456 {
457 const struct gen_device_info *devinfo = p->devinfo;
458 assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
459 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
460 if (devinfo->gen < 12)
461 brw_inst_set_src1_file_type(devinfo, inst,
462 BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
463 brw_inst_set_send_desc(devinfo, inst, desc);
464 if (devinfo->gen >= 9)
465 brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
466 }
467
468 static void brw_set_math_message( struct brw_codegen *p,
469 brw_inst *inst,
470 unsigned function,
471 unsigned integer_type,
472 bool low_precision,
473 unsigned dataType )
474 {
475 const struct gen_device_info *devinfo = p->devinfo;
476 unsigned msg_length;
477 unsigned response_length;
478
479 /* Infer message length from the function */
480 switch (function) {
481 case BRW_MATH_FUNCTION_POW:
482 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
483 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
484 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
485 msg_length = 2;
486 break;
487 default:
488 msg_length = 1;
489 break;
490 }
491
492 /* Infer response length from the function */
493 switch (function) {
494 case BRW_MATH_FUNCTION_SINCOS:
495 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
496 response_length = 2;
497 break;
498 default:
499 response_length = 1;
500 break;
501 }
502
503 brw_set_desc(p, inst, brw_message_desc(
504 devinfo, msg_length, response_length, false));
505
506 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
507 brw_inst_set_math_msg_function(devinfo, inst, function);
508 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
509 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
510 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
511 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
512 brw_inst_set_saturate(devinfo, inst, 0);
513 }
514
515
/**
 * Encode an FF_SYNC URB message: a single-register, header-present URB
 * message used by the Gen4-5 geometry pipeline.
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Message length is always 1 (the header); header_present is true. */
   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
537
/**
 * Encode a URB write message descriptor and control fields on \p insn.
 *
 * \p flags selects EOT/complete/allocate/per-slot-offset behavior (some of
 * which only exist on certain generations, enforced by the asserts below);
 * \p swizzle_control and \p offset fill the corresponding URB fields.
 */
static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Transpose swizzling and ALLOCATE are Gen4-6 only; PER_SLOT_OFFSET is
    * Gen7+ only.
    */
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
580
/**
 * Encode a Gen7+ data-port scratch block read/write message on \p inst.
 *
 * \p num_regs must be 1, 2, or 4 (or 8 on Gen8+); the block-size field is
 * log2(num_regs) on Gen8+ and num_regs - 1 before that.
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   /* Gen8+ encodes the block size as log2; earlier gens as num_regs - 1. */
   const unsigned block_size = (devinfo->gen >= 8 ? util_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
610
/**
 * Copy the codegen default state (\p state) into the instruction word:
 * exec size/group, compression, access mode, masking, predication,
 * saturation, flag register selection, and (where the gen supports them)
 * SWSB scheduling info and accumulator-write control.
 */
static void
brw_inst_set_state(const struct gen_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   /* Software scoreboard (SWSB) dependency info only exists on Gen12+. */
   if (devinfo->gen >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* flag_subreg packs both the flag register number (subreg / 2) and the
    * subregister (subreg % 2); 3-src align16 instructions use a different
    * field location than everything else.
    */
   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
641
#define next_insn brw_next_insn
/**
 * Append a new, zero-initialized instruction with \p opcode to the program,
 * growing the instruction store if necessary, and apply the codegen's
 * current default state to it.  Returns a pointer into p->store.
 */
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* Double the store when full.  NOTE(review): the reralloc result is not
    * checked for NULL -- presumably OOM is treated as fatal here; confirm.
    */
   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   /* Each native instruction occupies 16 bytes. */
   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];

   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(devinfo, insn, opcode);

   /* Apply the default instruction state */
   brw_inst_set_state(devinfo, insn, p->current);

   return insn;
}
665
666 static brw_inst *
667 brw_alu1(struct brw_codegen *p, unsigned opcode,
668 struct brw_reg dest, struct brw_reg src)
669 {
670 brw_inst *insn = next_insn(p, opcode);
671 brw_set_dest(p, insn, dest);
672 brw_set_src0(p, insn, src);
673 return insn;
674 }
675
676 static brw_inst *
677 brw_alu2(struct brw_codegen *p, unsigned opcode,
678 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
679 {
680 /* 64-bit immediates are only supported on 1-src instructions */
681 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
682 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
683
684 brw_inst *insn = next_insn(p, opcode);
685 brw_set_dest(p, insn, dest);
686 brw_set_src0(p, insn, src0);
687 brw_set_src1(p, insn, src1);
688 return insn;
689 }
690
691 static int
692 get_3src_subreg_nr(struct brw_reg reg)
693 {
694 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
695 * use 32-bit units (components 0..7). Since they only support F/D/UD
696 * types, this doesn't lose any flexibility, but uses fewer bits.
697 */
698 return reg.subnr / 4;
699 }
700
701 static enum gen10_align1_3src_vertical_stride
702 to_3src_align1_vstride(const struct gen_device_info *devinfo,
703 enum brw_vertical_stride vstride)
704 {
705 switch (vstride) {
706 case BRW_VERTICAL_STRIDE_0:
707 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
708 case BRW_VERTICAL_STRIDE_1:
709 assert(devinfo->gen >= 12);
710 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
711 case BRW_VERTICAL_STRIDE_2:
712 assert(devinfo->gen < 12);
713 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
714 case BRW_VERTICAL_STRIDE_4:
715 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
716 case BRW_VERTICAL_STRIDE_8:
717 case BRW_VERTICAL_STRIDE_16:
718 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
719 default:
720 unreachable("invalid vstride");
721 }
722 }
723
724
725 static enum gen10_align1_3src_src_horizontal_stride
726 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
727 {
728 switch (hstride) {
729 case BRW_HORIZONTAL_STRIDE_0:
730 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
731 case BRW_HORIZONTAL_STRIDE_1:
732 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
733 case BRW_HORIZONTAL_STRIDE_2:
734 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
735 case BRW_HORIZONTAL_STRIDE_4:
736 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
737 default:
738 unreachable("invalid hstride");
739 }
740 }
741
742 static brw_inst *
743 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
744 struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
745 {
746 const struct gen_device_info *devinfo = p->devinfo;
747 brw_inst *inst = next_insn(p, opcode);
748
749 gen7_convert_mrf_to_grf(p, &dest);
750
751 assert(dest.nr < 128);
752
753 if (devinfo->gen >= 10)
754 assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
755 src2.file == BRW_IMMEDIATE_VALUE));
756
757 assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
758 assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
759 assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
760 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
761 assert(src0.address_mode == BRW_ADDRESS_DIRECT);
762 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
763 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
764
765 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
766 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
767 dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
768
769 if (devinfo->gen >= 12) {
770 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
771 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
772 } else {
773 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
774 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
775 BRW_ALIGN1_3SRC_ACCUMULATOR);
776 brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
777 } else {
778 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
779 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
780 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
781 }
782 }
783 brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
784
785 brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
786
787 if (brw_reg_type_is_floating_point(dest.type)) {
788 brw_inst_set_3src_a1_exec_type(devinfo, inst,
789 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
790 } else {
791 brw_inst_set_3src_a1_exec_type(devinfo, inst,
792 BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
793 }
794
795 brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
796 brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
797 brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
798 brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
799
800 if (src0.file == BRW_IMMEDIATE_VALUE) {
801 brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
802 } else {
803 brw_inst_set_3src_a1_src0_vstride(
804 devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
805 brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
806 to_3src_align1_hstride(src0.hstride));
807 brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
808 if (src0.type == BRW_REGISTER_TYPE_NF) {
809 brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
810 } else {
811 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
812 }
813 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
814 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
815 }
816 brw_inst_set_3src_a1_src1_vstride(
817 devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
818 brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
819 to_3src_align1_hstride(src1.hstride));
820
821 brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
822 if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
823 brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
824 } else {
825 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
826 }
827 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
828 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
829
830 if (src2.file == BRW_IMMEDIATE_VALUE) {
831 brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
832 } else {
833 brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
834 to_3src_align1_hstride(src2.hstride));
835 /* no vstride on src2 */
836 brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
837 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
838 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
839 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
840 }
841
842 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
843 src0.file == BRW_IMMEDIATE_VALUE ||
844 (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
845 src0.type == BRW_REGISTER_TYPE_NF));
846 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
847 src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
848 assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
849 src2.file == BRW_IMMEDIATE_VALUE);
850
851 if (devinfo->gen >= 12) {
852 if (src0.file == BRW_IMMEDIATE_VALUE) {
853 brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
854 } else {
855 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
856 }
857
858 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);
859
860 if (src2.file == BRW_IMMEDIATE_VALUE) {
861 brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
862 } else {
863 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
864 }
865 } else {
866 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
867 src0.file == BRW_GENERAL_REGISTER_FILE ?
868 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
869 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
870 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
871 src1.file == BRW_GENERAL_REGISTER_FILE ?
872 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
873 BRW_ALIGN1_3SRC_ACCUMULATOR);
874 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
875 src2.file == BRW_GENERAL_REGISTER_FILE ?
876 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
877 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
878 }
879
880 } else {
881 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
882 dest.file == BRW_MESSAGE_REGISTER_FILE);
883 assert(dest.type == BRW_REGISTER_TYPE_F ||
884 dest.type == BRW_REGISTER_TYPE_DF ||
885 dest.type == BRW_REGISTER_TYPE_D ||
886 dest.type == BRW_REGISTER_TYPE_UD ||
887 (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
888 if (devinfo->gen == 6) {
889 brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
890 dest.file == BRW_MESSAGE_REGISTER_FILE);
891 }
892 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
893 brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
894 brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
895
896 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
897 brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
898 brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
899 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
900 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
901 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
902 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
903 src0.vstride == BRW_VERTICAL_STRIDE_0);
904
905 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
906 brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
907 brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
908 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
909 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
910 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
911 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
912 src1.vstride == BRW_VERTICAL_STRIDE_0);
913
914 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
915 brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
916 brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
917 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
918 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
919 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
920 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
921 src2.vstride == BRW_VERTICAL_STRIDE_0);
922
923 if (devinfo->gen >= 7) {
924 /* Set both the source and destination types based on dest.type,
925 * ignoring the source register types. The MAD and LRP emitters ensure
926 * that all four types are float. The BFE and BFI2 emitters, however,
927 * may send us mixed D and UD types and want us to ignore that and use
928 * the destination type.
929 */
930 brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
931 brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
932
933 /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
934 *
935 * "Three source instructions can use operands with mixed-mode
936 * precision. When SrcType field is set to :f or :hf it defines
937 * precision for source 0 only, and fields Src1Type and Src2Type
938 * define precision for other source operands:
939 *
940 * 0b = :f. Single precision Float (32-bit).
941 * 1b = :hf. Half precision Float (16-bit)."
942 */
943 if (src1.type == BRW_REGISTER_TYPE_HF)
944 brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
945
946 if (src2.type == BRW_REGISTER_TYPE_HF)
947 brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
948 }
949 }
950
951 return inst;
952 }
953
954
955 /***********************************************************************
956 * Convenience routines.
957 */
/* Define brw_<OP>(p, dest, src0): a thin wrapper emitting the single-source
 * ALU opcode BRW_OPCODE_<OP> via brw_alu1().
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
          struct brw_reg dest,                              \
          struct brw_reg src0)                              \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}
965
/* Define brw_<OP>(p, dest, src0, src1): a thin wrapper emitting the
 * two-source ALU opcode BRW_OPCODE_<OP> via brw_alu2().
 */
#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
          struct brw_reg dest,                              \
          struct brw_reg src0,                              \
          struct brw_reg src1)                              \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}
974
/* Define brw_<OP>(p, dest, src0, src1, src2) for three-source opcodes.
 * In Align16 mode a scalar source (vstride 0) replicates through its
 * swizzle, so force an XXXX swizzle on any such source before emitting.
 */
#define ALU3(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
          struct brw_reg dest,                              \
          struct brw_reg src0,                              \
          struct brw_reg src1,                              \
          struct brw_reg src2)                              \
{                                                           \
   if (p->current->access_mode == BRW_ALIGN_16) {           \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)            \
         src0.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)            \
         src1.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)            \
         src2.swizzle = BRW_SWIZZLE_XXXX;                   \
   }                                                        \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);      \
}
992
/* Like ALU3, but additionally asserts that all four operands are
 * uniformly F or uniformly DF — used for opcodes (e.g. LRP) that only
 * operate on floating-point data.
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1,                         \
                   struct brw_reg src2)                         \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
                                                                \
   if (p->current->access_mode == BRW_ALIGN_16) {               \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                \
         src0.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                \
         src1.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                \
         src2.swizzle = BRW_SWIZZLE_XXXX;                       \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
1022
/* Instantiate the emitters for the common one-, two- and three-source
 * ALU opcodes from the ALU1/ALU2/ALU3(F) templates above.
 */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)
1057
1058 brw_inst *
1059 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1060 {
1061 const struct gen_device_info *devinfo = p->devinfo;
1062
1063 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1064 * To avoid the problems that causes, we use an <X,2,0> source region to
1065 * read each element twice.
1066 */
1067 if (devinfo->gen == 7 && !devinfo->is_haswell &&
1068 brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1069 dest.type == BRW_REGISTER_TYPE_DF &&
1070 (src0.type == BRW_REGISTER_TYPE_F ||
1071 src0.type == BRW_REGISTER_TYPE_D ||
1072 src0.type == BRW_REGISTER_TYPE_UD) &&
1073 !has_scalar_region(src0)) {
1074 assert(src0.vstride == src0.width + src0.hstride);
1075 src0.vstride = src0.hstride;
1076 src0.width = BRW_WIDTH_2;
1077 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1078 }
1079
1080 return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1081 }
1082
1083 brw_inst *
1084 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1085 struct brw_reg src0, struct brw_reg src1)
1086 {
1087 /* 6.2.2: add */
1088 if (src0.type == BRW_REGISTER_TYPE_F ||
1089 (src0.file == BRW_IMMEDIATE_VALUE &&
1090 src0.type == BRW_REGISTER_TYPE_VF)) {
1091 assert(src1.type != BRW_REGISTER_TYPE_UD);
1092 assert(src1.type != BRW_REGISTER_TYPE_D);
1093 }
1094
1095 if (src1.type == BRW_REGISTER_TYPE_F ||
1096 (src1.file == BRW_IMMEDIATE_VALUE &&
1097 src1.type == BRW_REGISTER_TYPE_VF)) {
1098 assert(src0.type != BRW_REGISTER_TYPE_UD);
1099 assert(src0.type != BRW_REGISTER_TYPE_D);
1100 }
1101
1102 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1103 }
1104
1105 brw_inst *
1106 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1107 struct brw_reg src0, struct brw_reg src1)
1108 {
1109 assert(dest.type == src0.type);
1110 assert(src0.type == src1.type);
1111 switch (src0.type) {
1112 case BRW_REGISTER_TYPE_B:
1113 case BRW_REGISTER_TYPE_UB:
1114 case BRW_REGISTER_TYPE_W:
1115 case BRW_REGISTER_TYPE_UW:
1116 case BRW_REGISTER_TYPE_D:
1117 case BRW_REGISTER_TYPE_UD:
1118 break;
1119 default:
1120 unreachable("Bad type for brw_AVG");
1121 }
1122
1123 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1124 }
1125
1126 brw_inst *
1127 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1128 struct brw_reg src0, struct brw_reg src1)
1129 {
1130 /* 6.32.38: mul */
1131 if (src0.type == BRW_REGISTER_TYPE_D ||
1132 src0.type == BRW_REGISTER_TYPE_UD ||
1133 src1.type == BRW_REGISTER_TYPE_D ||
1134 src1.type == BRW_REGISTER_TYPE_UD) {
1135 assert(dest.type != BRW_REGISTER_TYPE_F);
1136 }
1137
1138 if (src0.type == BRW_REGISTER_TYPE_F ||
1139 (src0.file == BRW_IMMEDIATE_VALUE &&
1140 src0.type == BRW_REGISTER_TYPE_VF)) {
1141 assert(src1.type != BRW_REGISTER_TYPE_UD);
1142 assert(src1.type != BRW_REGISTER_TYPE_D);
1143 }
1144
1145 if (src1.type == BRW_REGISTER_TYPE_F ||
1146 (src1.file == BRW_IMMEDIATE_VALUE &&
1147 src1.type == BRW_REGISTER_TYPE_VF)) {
1148 assert(src0.type != BRW_REGISTER_TYPE_UD);
1149 assert(src0.type != BRW_REGISTER_TYPE_D);
1150 }
1151
1152 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1153 src0.nr != BRW_ARF_ACCUMULATOR);
1154 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1155 src1.nr != BRW_ARF_ACCUMULATOR);
1156
1157 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1158 }
1159
1160 brw_inst *
1161 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1162 struct brw_reg src0, struct brw_reg src1)
1163 {
1164 src0.vstride = BRW_VERTICAL_STRIDE_0;
1165 src0.width = BRW_WIDTH_1;
1166 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1167 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1168 }
1169
1170 brw_inst *
1171 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1172 struct brw_reg src0, struct brw_reg src1)
1173 {
1174 src0.vstride = BRW_VERTICAL_STRIDE_0;
1175 src0.width = BRW_WIDTH_1;
1176 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1177 src1.vstride = BRW_VERTICAL_STRIDE_8;
1178 src1.width = BRW_WIDTH_8;
1179 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1180 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1181 }
1182
/* Emit an F32->F16 conversion into dst.
 *
 * On Gen8+ this is a converting MOV to an HF destination; on Gen7 it is
 * the dedicated F32TO16 instruction.  When the caller asks for a UD
 * destination outside the one case where the hardware zeroes the high
 * bits itself, a second MOV is emitted to clear the upper 16 bits.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Retype the UD destination as W with a stride of 2 so the
       * conversion lands in the low word of each dword; the odd words
       * are filled with zero below.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* The conversion and the zero-fill write disjoint words of the same
       * registers, so suppress dependency control between them pre-Gen12.
       */
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      brw_set_default_swsb(p, tgl_swsb_null());
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1232
/* Emit an F16->F32 conversion into dst.
 *
 * Gen8+ uses a converting MOV from an HF source; Gen7 uses the dedicated
 * F16TO32 instruction, which requires a word-typed source in Align1 mode.
 */
brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}
1263
1264
1265 void brw_NOP(struct brw_codegen *p)
1266 {
1267 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1268 memset(insn, 0, sizeof(*insn));
1269 brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1270 }
1271
1272 void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
1273 {
1274 brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
1275 brw_inst_set_cond_modifier(p->devinfo, insn, func);
1276 }
1277
1278 /***********************************************************************
1279 * Comparisons, if/else/endif
1280 */
1281
1282 brw_inst *
1283 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1284 unsigned predicate_control)
1285 {
1286 const struct gen_device_info *devinfo = p->devinfo;
1287 struct brw_reg ip = brw_ip_reg();
1288 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1289
1290 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1291 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1292 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1293 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1294
1295 return inst;
1296 }
1297
1298 static void
1299 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1300 {
1301 p->if_stack[p->if_stack_depth] = inst - p->store;
1302
1303 p->if_stack_depth++;
1304 if (p->if_stack_array_size <= p->if_stack_depth) {
1305 p->if_stack_array_size *= 2;
1306 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1307 p->if_stack_array_size);
1308 }
1309 }
1310
1311 static brw_inst *
1312 pop_if_stack(struct brw_codegen *p)
1313 {
1314 p->if_stack_depth--;
1315 return &p->store[p->if_stack[p->if_stack_depth]];
1316 }
1317
/* Push a loop (its DO instruction, or the would-be DO location) onto the
 * loop stack, and reset the IF-nesting counter for the new loop level.
 * Stores an index, not a pointer, since p->store may be reallocated.
 */
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   /* Grow both parallel arrays before pushing; +1 because
    * if_depth_in_loop is indexed by the post-increment depth below.
    */
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   /* No IFs seen yet inside the loop we just entered. */
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}
1333
1334 static brw_inst *
1335 get_inner_do_insn(struct brw_codegen *p)
1336 {
1337 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1338 }
1339
1340 /* EU takes the value from the flag register and pushes it onto some
1341 * sort of a stack (presumably merging with any flag value already on
1342 * the stack). Within an if block, the flags at the top of the stack
1343 * control execution on each channel of the unit, eg. on each of the
1344 * 16 pixel values in our wm programs.
1345 *
1346 * When the matching 'else' instruction is reached (presumably by
1347 * countdown of the instruction count patched in by our ELSE/ENDIF
1348 * functions), the relevant flags are inverted.
1349 *
1350 * When the matching 'endif' instruction is reached, the flags are
1351 * popped off. If the stack is now empty, normal execution resumes.
1352 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction.  The operand encoding of
    * IF differs per generation; all jump targets (jump count or JIP/UIP)
    * are left zero here and patched later by patch_IF_ELSE (from
    * brw_ENDIF).
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Remember this IF so the matching ELSE/ENDIF can find and patch it,
    * and count it toward the enclosing loop's IF nesting (used by
    * pre-Gen6 BREAK/CONT pop counts).
    */
   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1397
1398 /* This function is only used for gen6-style IF instructions with an
1399 * embedded comparison (conditional modifier). It is not used on gen7.
1400 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Gen6 IF takes an immediate-word destination; the jump count is left
    * zero here and patched later by patch_IF_ELSE (from brw_ENDIF).
    */
   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The embedded comparison supplies the branch condition, so the
    * instruction must not also be compressed or predicated.
    */
   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1423
1424 /**
1425 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1426 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* IP offsets are in bytes; the *16 scales instruction counts to
       * bytes (16-byte instructions on the gens this path serves).
       */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1464
1465 /**
1466 * Patch IF and ELSE instructions with appropriate jump targets.
1467 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* Scale factor converting an instruction-count distance into the units
    * the jump fields use on this generation.
    */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1555
/* Emit an ELSE for the innermost open IF.  Operand encoding mirrors
 * brw_IF per generation; jump targets stay zero until patch_IF_ELSE
 * fills them in at the matching ENDIF.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Push onto the same stack as the IF; brw_ENDIF pops both. */
   push_if_stack(p, insn);
}
1594
/* Close the innermost IF/ELSE block: emit the ENDIF (when needed) and
 * patch the matching IF/ELSE jump targets, or — on Gen4/5 in SPF mode —
 * convert the IF/ELSE pair into predicated ADDs on IP instead.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encoding for ENDIF, mirroring brw_IF/brw_ELSE. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1675
/* Emit a BREAK out of the innermost loop.  Jump targets are filled in
 * later (see brw_set_uip_jip / pre-Gen6 WHILE patching).
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      /* Pre-Gen6, BREAK must pop one mask-stack entry for every IF we
       * are currently nested inside within this loop.
       */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
1702
/* Emit a CONTINUE to the innermost loop's next iteration.  Jump targets
 * are filled in later.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      /* Like BREAK: pop the mask-stack entries of the enclosing IFs. */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1726
/* Emit a HALT (used e.g. for fragment discard jumps).  Jump targets are
 * filled in later by the caller.
 */
brw_inst *
brw_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen < 6) {
      /* From the Gen4 PRM:
       *
       *    "IP register must be put (for example, by the assembler) at <dst>
       *    and <src0> locations.
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
   } else if (devinfo->gen < 8) {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   } else if (devinfo->gen < 12) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1755
1756 /* DO/WHILE loop:
1757 *
1758 * The DO/WHILE is just an unterminated loop -- break or continue are
1759 * used for control within the loop. We have a few ways they can be
1760 * done.
1761 *
1762 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1763 * jip and no DO instruction.
1764 *
1765 * For non-uniform control flow pre-gen6, there's a DO instruction to
1766 * push the mask, and a WHILE to jump back, and BREAK to get out and
1767 * pop the mask.
1768 *
1769 * For gen6, there's no more mask stack, so no need for DO. WHILE
1770 * just points back to the first instruction of the loop.
1771 */
/**
 * Begin a loop: record (and, pre-Gen6, emit) the DO at the loop head.
 *
 * On Gen6+ (and in single-program-flow mode) no DO instruction exists;
 * we just push the address of the next instruction slot on the loop
 * stack and return it as a position marker for brw_WHILE().
 *
 * Pre-Gen6 a real DO instruction is emitted (it pushes the execution
 * mask — see the comment block above), unpredicated and uncompressed,
 * with the requested execution size.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction: null operands, no
       * compression, no predication.
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1798
1799 /**
1800 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1801 * instruction here.
1802 *
1803 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1804 * nesting, since it can always just point to the end of the block/current loop.
1805 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   /* Walk backwards from the WHILE to its matching DO, patching any
    * BREAK/CONT whose jump count is still zero.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK targets one past the WHILE, hence the extra +1. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONT targets the WHILE itself. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
1830
/**
 * Emit the WHILE at the bottom of a loop, jumping back to the matching
 * DO (or the loop-head position recorded by brw_DO() on Gen6+).
 *
 * Pre-Gen6 in single-program-flow mode the WHILE degenerates to a scalar
 * "ADD ip, ip, <negative offset>"; otherwise a real WHILE is emitted and
 * the enclosed BREAK/CONT instructions are patched via
 * brw_patch_break_cont().  Pops one level off the loop stack.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* The operand layout differs per generation; the backward jump
       * distance (in units of brw_jump_scale()) is written directly.
       */
      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         if (devinfo->gen < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* Uniform control flow: a plain scalar jump back to the head. */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         /* The WHILE inherits the execution width of its matching DO. */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1893
1894 /* FORWARD JUMPS:
1895 */
1896 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1897 {
1898 const struct gen_device_info *devinfo = p->devinfo;
1899 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1900 unsigned jmpi = 1;
1901
1902 if (devinfo->gen >= 5)
1903 jmpi = 2;
1904
1905 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1906 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1907
1908 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1909 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1910 }
1911
1912 /* To integrate with the above, it makes sense that the comparison
1913 * instruction should populate the flag register. It might be simpler
1914 * just to use the flag reg for most WM tasks?
1915 */
1916 void brw_CMP(struct brw_codegen *p,
1917 struct brw_reg dest,
1918 unsigned conditional,
1919 struct brw_reg src0,
1920 struct brw_reg src1)
1921 {
1922 const struct gen_device_info *devinfo = p->devinfo;
1923 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1924
1925 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1926 brw_set_dest(p, insn, dest);
1927 brw_set_src0(p, insn, src0);
1928 brw_set_src1(p, insn, src1);
1929
1930 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1931 * page says:
1932 * "Any CMP instruction with a null destination must use a {switch}."
1933 *
1934 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1935 * mentioned on their work-arounds pages.
1936 */
1937 if (devinfo->gen == 7) {
1938 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1939 dest.nr == BRW_ARF_NULL) {
1940 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1941 }
1942 }
1943 }
1944
1945 /***********************************************************************
1946 * Helpers for the various SEND message types:
1947 */
1948
1949 /** Extended math function, float[8].
1950 */
1951 void gen4_math(struct brw_codegen *p,
1952 struct brw_reg dest,
1953 unsigned function,
1954 unsigned msg_reg_nr,
1955 struct brw_reg src,
1956 unsigned precision )
1957 {
1958 const struct gen_device_info *devinfo = p->devinfo;
1959 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1960 unsigned data_type;
1961 if (has_scalar_region(src)) {
1962 data_type = BRW_MATH_DATA_SCALAR;
1963 } else {
1964 data_type = BRW_MATH_DATA_VECTOR;
1965 }
1966
1967 assert(devinfo->gen < 6);
1968
1969 /* Example code doesn't set predicate_control for send
1970 * instructions.
1971 */
1972 brw_inst_set_pred_control(devinfo, insn, 0);
1973 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1974
1975 brw_set_dest(p, insn, dest);
1976 brw_set_src0(p, insn, src);
1977 brw_set_math_message(p,
1978 insn,
1979 function,
1980 src.type == BRW_REGISTER_TYPE_D,
1981 precision,
1982 data_type);
1983 }
1984
/**
 * Extended math function on Gen6+, using the dedicated MATH instruction.
 *
 * The integer-divide functions take non-float sources; every other
 * function takes float (or, on Gen9+, half-float) sources.  The asserts
 * encode the per-generation operand-region and source-modifier
 * restrictions.
 */
void gen6_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   /* Destination must be a GRF, or an MRF from Gen7 on. */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      /* An immediate src1 is only allowed from Gen8 on. */
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F ||
             (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
      assert(src1.type == BRW_REGISTER_TYPE_F ||
             (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
2033
2034 /**
2035 * Return the right surface index to access the thread scratch space using
2036 * stateless dataport messages.
2037 */
2038 unsigned
2039 brw_scratch_surface_idx(const struct brw_codegen *p)
2040 {
2041 /* The scratch space is thread-local so IA coherency is unnecessary. */
2042 if (p->devinfo->gen >= 8)
2043 return GEN8_BTI_STATELESS_NON_COHERENT;
2044 else
2045 return BRW_BTI_STATELESS;
2046 }
2047
2048 /**
2049 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2050 * using a constant offset per channel.
2051 *
2052 * The offset must be aligned to oword size (16 bytes). Used for
2053 * register spilling.
2054 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);
   uint32_t msg_type;

   /* From Gen6 on, the message takes the offset in oword (16-byte) units. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One header register plus the data payload. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      /* Thread the caller's SWSB source dependency onto the header copy. */
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
      /* The SEND below must wait on the header writes (regdist 1). */
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
         src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                     msg_type, 0, /* not a render target */
                                     send_commit_msg));
   }
}
2157
2158
2159 /**
2160 * Read a block of owords (half a GRF each) from the scratch buffer
2161 * using a constant index per channel.
2162 *
2163 * Offset must be aligned to oword size (16 bytes). Used for register
2164 * spilling.
2165 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   /* From Gen6 on, the message takes the offset in oword (16-byte) units. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_READ);

   /* Build the message header: copy g0 and store the scratch offset in
    * element 2 (same layout as brw_oword_block_write_scratch()).
    */
   {
      brw_push_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
      /* The SEND below must wait on the header writes (regdist 1). */
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest);	/* UW? */
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, 1, rlen, true) |
                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   }
}
2240
2241 void
2242 gen7_block_read_scratch(struct brw_codegen *p,
2243 struct brw_reg dest,
2244 int num_regs,
2245 unsigned offset)
2246 {
2247 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2248 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2249
2250 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2251
2252 /* The HW requires that the header is present; this is to get the g0.5
2253 * scratch offset.
2254 */
2255 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2256
2257 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2258 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2259 * is 32 bytes, which happens to be the size of a register.
2260 */
2261 offset /= REG_SIZE;
2262 assert(offset < (1 << 12));
2263
2264 gen7_set_dp_scratch_message(p, insn,
2265 false, /* scratch read */
2266 false, /* OWords */
2267 false, /* invalidate after read */
2268 num_regs,
2269 offset,
2270 1, /* mlen: just g0 */
2271 num_regs, /* rlen */
2272 true); /* header present */
2273 }
2274
2275 /**
2276 * Read float[4] vectors from the data port constant cache.
2277 * Location (in buffer) should be a multiple of 16.
2278 * Used for fetching shader constants.
2279 */
void brw_oword_block_read(struct brw_codegen *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_SFID_DATAPORT_READ);
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Outer state push: the header setup and the SEND itself must run
    * unpredicated, uncompressed and with WE_all.
    */
   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Inner push for the SIMD8/SIMD1 header moves only. */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_swsb(p, tgl_swsb_null());
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));
   brw_pop_insn_state(p);

   /* The SEND must wait on the header writes (regdist 1). */
   brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_inst_set_sfid(devinfo, insn, target_cache);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
                brw_dp_read_desc(devinfo, bind_table_index,
                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));

   brw_pop_insn_state(p);
}
2345
2346 brw_inst *
2347 brw_fb_WRITE(struct brw_codegen *p,
2348 struct brw_reg payload,
2349 struct brw_reg implied_header,
2350 unsigned msg_control,
2351 unsigned binding_table_index,
2352 unsigned msg_length,
2353 unsigned response_length,
2354 bool eot,
2355 bool last_render_target,
2356 bool header_present)
2357 {
2358 const struct gen_device_info *devinfo = p->devinfo;
2359 const unsigned target_cache =
2360 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2361 BRW_SFID_DATAPORT_WRITE);
2362 brw_inst *insn;
2363 unsigned msg_type;
2364 struct brw_reg dest, src0;
2365
2366 if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
2367 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2368 else
2369 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2370
2371 if (devinfo->gen >= 6) {
2372 insn = next_insn(p, BRW_OPCODE_SENDC);
2373 } else {
2374 insn = next_insn(p, BRW_OPCODE_SEND);
2375 }
2376 brw_inst_set_sfid(devinfo, insn, target_cache);
2377 brw_inst_set_compression(devinfo, insn, false);
2378
2379 if (devinfo->gen >= 6) {
2380 /* headerless version, just submit color payload */
2381 src0 = payload;
2382
2383 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2384 } else {
2385 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2386 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2387 src0 = implied_header;
2388
2389 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2390 }
2391
2392 brw_set_dest(p, insn, dest);
2393 brw_set_src0(p, insn, src0);
2394 brw_set_desc(p, insn,
2395 brw_message_desc(devinfo, msg_length, response_length,
2396 header_present) |
2397 brw_dp_write_desc(devinfo, binding_table_index, msg_control,
2398 msg_type, last_render_target,
2399 0 /* send_commit_msg */));
2400 brw_inst_set_eot(devinfo, insn, eot);
2401
2402 return insn;
2403 }
2404
2405 brw_inst *
2406 gen9_fb_READ(struct brw_codegen *p,
2407 struct brw_reg dst,
2408 struct brw_reg payload,
2409 unsigned binding_table_index,
2410 unsigned msg_length,
2411 unsigned response_length,
2412 bool per_sample)
2413 {
2414 const struct gen_device_info *devinfo = p->devinfo;
2415 assert(devinfo->gen >= 9);
2416 const unsigned msg_subtype =
2417 brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
2418 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2419
2420 brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
2421 brw_set_dest(p, insn, dst);
2422 brw_set_src0(p, insn, payload);
2423 brw_set_desc(
2424 p, insn,
2425 brw_message_desc(devinfo, msg_length, response_length, true) |
2426 brw_dp_read_desc(devinfo, binding_table_index,
2427 per_sample << 5 | msg_subtype,
2428 GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2429 BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2430 brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
2431
2432 return insn;
2433 }
2434
2435 /**
2436 * Texture sample instruction.
2437 * Note: the msg_type plus msg_length values determine exactly what kind
2438 * of sampling operation is performed. See volume 4, page 161 of docs.
2439 */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr == -1 skips the implied payload move into the MRFs. */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_sampler_desc(devinfo, binding_table_index, sampler,
                                 msg_type, simd_mode, return_format));
}
2488
2489 /* Adjust the message header's sampler state pointer to
2490 * select the correct group of 16 samplers.
2491 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      /* Known at compile time: only adjust when the index spills past the
       * first group of 16 samplers.
       */
      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         /* NOTE(review): presumably these platforms never need >16 samplers,
          * so no adjustment is required — confirm against callers.
          */
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* Compute 16 * (index / 16) * 16 at runtime:
       * mask off the low 4 bits, then shift left by 4.
       */
      brw_push_insn_state(p);
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_set_default_swsb(p, tgl_swsb_regdist(1));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
      brw_pop_insn_state(p);
   }
}
2537
2538 /* All these variables are pretty confusing - we might be better off
2539 * using bitmasks and macros for this, in the old style. Or perhaps
2540 * just having the caller instantiate the fields in dword3 itself.
2541 */
/**
 * Emit a URB write message.
 *
 * Resolves the implied payload move first (see
 * gen6_resolve_implied_move()).  On Gen7+, unless the caller manages
 * channel masks itself (BRW_URB_WRITE_USE_CHANNEL_MASKS), the channel
 * mask bits in DWord 5 of the URB_WRITE_HWORD message header are
 * enabled here.
 */
void brw_urb_WRITE(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}
2589
/**
 * Emit a SEND with either an immediate or an indirect (register) message
 * descriptor.
 *
 * With an immediate descriptor, desc.ud | desc_imm is encoded directly.
 * With a register descriptor, the descriptor is first OR'ed with
 * desc_imm into address register a0.0 (threading the caller's SWSB
 * dependencies through that OR), and the SEND then takes its descriptor
 * from a0 — via src1 pre-Gen12, or via the reg32-descriptor select bit
 * on Gen12+.
 */
void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm,
                          bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send, desc.ud | desc_imm);
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      /* The SEND must wait on the a0 write (regdist 1). */
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));

      if (devinfo->gen >= 12)
         brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
      else
         brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2643
/**
 * Emit a split SEND(S) instruction with two payload sources.
 *
 * The message descriptor is @desc OR'ed with @desc_imm, and the extended
 * descriptor is @ex_desc OR'ed with @ex_desc_imm.  When either descriptor is
 * not an immediate (or cannot be encoded as one), it is computed into an
 * address register (a0.0 for the regular descriptor, a0.2 for the extended
 * one) and the SEND reads it indirectly from there.
 */
void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      desc.ud |= desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   /* Gen12+ can encode all 32 extended-descriptor bits as an immediate;
    * earlier gens lack bits 15:12 in the instruction, so fall back to an
    * indirect extended descriptor when any of those bits are set.
    */
   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       (devinfo->gen >= 12 || (ex_desc.ud & INTEL_MASK(15, 12)) == 0)) {
      ex_desc.ud |= ex_desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register.  If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_desc_imm | sfid | eot << 5;

      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding prior
          * to Gen12, so we may have fallen back to an indirect extended
          * descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   /* Gen12 folds the split-send form into the regular SEND opcode. */
   send = next_insn(p, devinfo->gen >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      /* Indirect descriptor must be a0.0 exactly. */
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      /* Indirect extended descriptor must be a DWord-aligned address subreg. */
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2759
/**
 * Emit a surface dataport SEND whose binding table index comes either from
 * an immediate @surface or from a dynamically computed value.  In the
 * dynamic case the surface index is AND-masked into address register a0.0
 * and the message descriptor is read indirectly from there.
 */
static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      /* Save the caller's SWSB state; the helper moves below get their own
       * dependency annotations.
       */
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
}
2795
2796 static bool
2797 while_jumps_before_offset(const struct gen_device_info *devinfo,
2798 brw_inst *insn, int while_offset, int start_offset)
2799 {
2800 int scale = 16 / brw_jump_scale(devinfo);
2801 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2802 : brw_inst_jip(devinfo, insn);
2803 assert(jip < 0);
2804 return while_offset + jip * scale <= start_offset;
2805 }
2806
2807
2808 static int
2809 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2810 {
2811 int offset;
2812 void *store = p->store;
2813 const struct gen_device_info *devinfo = p->devinfo;
2814
2815 int depth = 0;
2816
2817 for (offset = next_offset(devinfo, store, start_offset);
2818 offset < p->next_insn_offset;
2819 offset = next_offset(devinfo, store, offset)) {
2820 brw_inst *insn = store + offset;
2821
2822 switch (brw_inst_opcode(devinfo, insn)) {
2823 case BRW_OPCODE_IF:
2824 depth++;
2825 break;
2826 case BRW_OPCODE_ENDIF:
2827 if (depth == 0)
2828 return offset;
2829 depth--;
2830 break;
2831 case BRW_OPCODE_WHILE:
2832 /* If the while doesn't jump before our instruction, it's the end
2833 * of a sibling do...while loop. Ignore it.
2834 */
2835 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2836 continue;
2837 /* fallthrough */
2838 case BRW_OPCODE_ELSE:
2839 case BRW_OPCODE_HALT:
2840 if (depth == 0)
2841 return offset;
2842 default:
2843 break;
2844 }
2845 }
2846
2847 return 0;
2848 }
2849
2850 /* There is no DO instruction on gen6, so to find the end of the loop
2851 * we have to see if the loop is jumping back before our start
2852 * instruction.
2853 */
2854 static int
2855 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2856 {
2857 const struct gen_device_info *devinfo = p->devinfo;
2858 int offset;
2859 void *store = p->store;
2860
2861 assert(devinfo->gen >= 6);
2862
2863 /* Always start after the instruction (such as a WHILE) we're trying to fix
2864 * up.
2865 */
2866 for (offset = next_offset(devinfo, store, start_offset);
2867 offset < p->next_insn_offset;
2868 offset = next_offset(devinfo, store, offset)) {
2869 brw_inst *insn = store + offset;
2870
2871 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2872 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2873 return offset;
2874 }
2875 }
2876 assert(!"not reached");
2877 return start_offset;
2878 }
2879
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   /* br = jump units per 16-byte instruction; scale = bytes per jump unit. */
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* Nothing to patch prior to Gen6. */
   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      /* The loop advances in fixed 16-byte steps, so no instruction may have
       * been compacted yet.
       */
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset +
                           (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* With no block end in sight, jump one whole instruction ahead
          * (1 * br jump units == 16 bytes).
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      default:
         break;
      }
   }
}
2955
2956 void brw_ff_sync(struct brw_codegen *p,
2957 struct brw_reg dest,
2958 unsigned msg_reg_nr,
2959 struct brw_reg src0,
2960 bool allocate,
2961 unsigned response_length,
2962 bool eot)
2963 {
2964 const struct gen_device_info *devinfo = p->devinfo;
2965 brw_inst *insn;
2966
2967 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2968
2969 insn = next_insn(p, BRW_OPCODE_SEND);
2970 brw_set_dest(p, insn, dest);
2971 brw_set_src0(p, insn, src0);
2972 brw_set_src1(p, insn, brw_imm_d(0));
2973
2974 if (devinfo->gen < 6)
2975 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2976
2977 brw_set_ff_sync_message(p,
2978 insn,
2979 allocate,
2980 response_length,
2981 eot);
2982 }
2983
2984 /**
2985 * Emit the SEND instruction necessary to generate stream output data on Gen6
2986 * (for transform feedback).
2987 *
2988 * If send_commit_msg is true, this is the last piece of stream output data
2989 * from this thread, so send the data as a committed write. According to the
2990 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2991 *
2992 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2993 * writes are complete by sending the final write as a committed write."
2994 */
2995 void
2996 brw_svb_write(struct brw_codegen *p,
2997 struct brw_reg dest,
2998 unsigned msg_reg_nr,
2999 struct brw_reg src0,
3000 unsigned binding_table_index,
3001 bool send_commit_msg)
3002 {
3003 const struct gen_device_info *devinfo = p->devinfo;
3004 const unsigned target_cache =
3005 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
3006 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
3007 BRW_SFID_DATAPORT_WRITE);
3008 brw_inst *insn;
3009
3010 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
3011
3012 insn = next_insn(p, BRW_OPCODE_SEND);
3013 brw_inst_set_sfid(devinfo, insn, target_cache);
3014 brw_set_dest(p, insn, dest);
3015 brw_set_src0(p, insn, src0);
3016 brw_set_desc(p, insn,
3017 brw_message_desc(devinfo, 1, send_commit_msg, true) |
3018 brw_dp_write_desc(devinfo, binding_table_index,
3019 0, /* msg_control: ignored */
3020 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
3021 0, /* last_render_target: ignored */
3022 send_commit_msg)); /* send_commit_msg */
3023 }
3024
/**
 * Number of payload registers needed for @num_channels components at the
 * given execution size (0 denotes the SIMD4x2 message layout).
 */
static unsigned
brw_surface_payload_size(struct brw_codegen *p,
                         unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   /* SIMD4x2 packs everything into a single register. */
   if (exec_size == 0)
      return 1;

   /* One register per channel up to SIMD8, two per channel above that. */
   return exec_size <= 8 ? num_channels : 2 * num_channels;
}
3037
/**
 * Emit an untyped atomic SEND performing @atomic_op on @surface.
 *
 * A return value is requested from the dataport iff @response_expected is
 * set.  In Align16 mode on hardware without native SIMD4x2 atomics, the
 * operation is emitted as SIMD8 with only the X component enabled (see the
 * comment on `mask` below).
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned response_length =
      brw_surface_payload_size(p, response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}
3074
3075 void
3076 brw_untyped_surface_read(struct brw_codegen *p,
3077 struct brw_reg dst,
3078 struct brw_reg payload,
3079 struct brw_reg surface,
3080 unsigned msg_length,
3081 unsigned num_channels)
3082 {
3083 const struct gen_device_info *devinfo = p->devinfo;
3084 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3085 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3086 GEN7_SFID_DATAPORT_DATA_CACHE);
3087 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3088 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
3089 const unsigned response_length =
3090 brw_surface_payload_size(p, num_channels, exec_size);
3091 const unsigned desc =
3092 brw_message_desc(devinfo, msg_length, response_length, false) |
3093 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3094
3095 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3096 }
3097
3098 void
3099 brw_untyped_surface_write(struct brw_codegen *p,
3100 struct brw_reg payload,
3101 struct brw_reg surface,
3102 unsigned msg_length,
3103 unsigned num_channels,
3104 bool header_present)
3105 {
3106 const struct gen_device_info *devinfo = p->devinfo;
3107 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3108 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3109 GEN7_SFID_DATAPORT_DATA_CACHE);
3110 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3111 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3112 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3113 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3114 has_simd4x2 ? 0 : 8;
3115 const unsigned desc =
3116 brw_message_desc(devinfo, msg_length, 0, header_present) |
3117 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
3118 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3119 const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
3120
3121 brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
3122 payload, surface, desc);
3123 }
3124
3125 static void
3126 brw_set_memory_fence_message(struct brw_codegen *p,
3127 struct brw_inst *insn,
3128 enum brw_message_target sfid,
3129 bool commit_enable,
3130 unsigned bti)
3131 {
3132 const struct gen_device_info *devinfo = p->devinfo;
3133
3134 brw_set_desc(p, insn, brw_message_desc(
3135 devinfo, 1, (commit_enable ? 1 : 0), true));
3136
3137 brw_inst_set_sfid(devinfo, insn, sfid);
3138
3139 switch (sfid) {
3140 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3141 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3142 break;
3143 case GEN7_SFID_DATAPORT_DATA_CACHE:
3144 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3145 break;
3146 default:
3147 unreachable("Not reached");
3148 }
3149
3150 if (commit_enable)
3151 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3152
3153 assert(devinfo->gen >= 11 || bti == 0);
3154 brw_inst_set_binding_table_index(devinfo, insn, bti);
3155 }
3156
3157 void
3158 brw_memory_fence(struct brw_codegen *p,
3159 struct brw_reg dst,
3160 struct brw_reg src,
3161 enum opcode send_op,
3162 enum brw_message_target sfid,
3163 bool commit_enable,
3164 unsigned bti)
3165 {
3166 const struct gen_device_info *devinfo = p->devinfo;
3167
3168 dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
3169 src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
3170
3171 /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3172 * message doesn't write anything back.
3173 */
3174 struct brw_inst *insn = next_insn(p, send_op);
3175 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3176 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3177 brw_set_dest(p, insn, dst);
3178 brw_set_src0(p, insn, src);
3179 brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
3180 }
3181
3182 void
3183 brw_pixel_interpolator_query(struct brw_codegen *p,
3184 struct brw_reg dest,
3185 struct brw_reg mrf,
3186 bool noperspective,
3187 unsigned mode,
3188 struct brw_reg data,
3189 unsigned msg_length,
3190 unsigned response_length)
3191 {
3192 const struct gen_device_info *devinfo = p->devinfo;
3193 const uint16_t exec_size = brw_get_default_exec_size(p);
3194 const unsigned slot_group = brw_get_default_group(p) / 16;
3195 const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3196 const unsigned desc =
3197 brw_message_desc(devinfo, msg_length, response_length, false) |
3198 brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
3199 slot_group);
3200
3201 /* brw_send_indirect_message will automatically use a direct send message
3202 * if data is actually immediate.
3203 */
3204 brw_send_indirect_message(p,
3205 GEN7_SFID_PIXEL_INTERPOLATOR,
3206 dest,
3207 mrf,
3208 vec1(data),
3209 desc,
3210 false);
3211 }
3212
/**
 * Write the index of the first live channel (relative to the current quarter
 * control) into the first component of @dst.
 *
 * @mask is a dispatch/vector mask that is combined with the hardware
 * execution mask where ce0 alone would be unreliable (see the inline
 * comments for the per-generation strategies).
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   /* The flag register is only used on Gen7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask. The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n). Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0. We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register. Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3335
/**
 * Copy the value of channel @idx of @src into all enabled channels of @dst.
 *
 * @idx may be an immediate or a dynamically computed index.  In the dynamic
 * case the component is selected with indirect addressing (Align1) or a
 * predicated SEL (Align16 SIMD4x2).
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(util_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3459
/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
 * one u32.  So we use the same untyped atomic write message as the pixel
 * shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   /* SIMD1 untyped atomic ADD, no writeback requested. */
   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
                          brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
                                                     false)));

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_binding_table_index(devinfo, send, surf_index);

   brw_pop_insn_state(p);
}
3508
3509
3510 /**
3511 * Emit the SEND message for a barrier
3512 */
3513 void
3514 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3515 {
3516 const struct gen_device_info *devinfo = p->devinfo;
3517 struct brw_inst *inst;
3518
3519 assert(devinfo->gen >= 7);
3520
3521 brw_push_insn_state(p);
3522 brw_set_default_access_mode(p, BRW_ALIGN_1);
3523 inst = next_insn(p, BRW_OPCODE_SEND);
3524 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3525 brw_set_src0(p, inst, src);
3526 brw_set_src1(p, inst, brw_null_reg());
3527 brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
3528
3529 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
3530 brw_inst_set_gateway_subfuncid(devinfo, inst,
3531 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3532
3533 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3534 brw_pop_insn_state(p);
3535 }
3536
3537
3538 /**
3539 * Emit the wait instruction for a barrier
3540 */
3541 void
3542 brw_WAIT(struct brw_codegen *p)
3543 {
3544 const struct gen_device_info *devinfo = p->devinfo;
3545 struct brw_inst *insn;
3546
3547 struct brw_reg src = brw_notification_reg();
3548
3549 insn = next_insn(p, BRW_OPCODE_WAIT);
3550 brw_set_dest(p, insn, src);
3551 brw_set_src0(p, insn, src);
3552 brw_set_src1(p, insn, brw_null_reg());
3553
3554 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3555 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3556 }
3557
3558 void
3559 brw_float_controls_mode(struct brw_codegen *p,
3560 unsigned mode, unsigned mask)
3561 {
3562 /* From the Skylake PRM, Volume 7, page 760:
3563 * "Implementation Restriction on Register Access: When the control
3564 * register is used as an explicit source and/or destination, hardware
3565 * does not ensure execution pipeline coherency. Software must set the
3566 * thread control field to ‘switch’ for an instruction that uses
3567 * control register as an explicit operand."
3568 *
3569 * On Gen12+ this is implemented in terms of SWSB annotations instead.
3570 */
3571 brw_set_default_swsb(p, tgl_swsb_regdist(1));
3572
3573 brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3574 brw_imm_ud(~mask));
3575 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3576 if (p->devinfo->gen < 12)
3577 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3578
3579 if (mode) {
3580 brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3581 brw_imm_ud(mode));
3582 brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
3583 if (p->devinfo->gen < 12)
3584 brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
3585 }
3586
3587 if (p->devinfo->gen >= 12)
3588 brw_SYNC(p, TGL_SYNC_NOP);
3589 }