intel/eu/gen12: Add tracking of default SWSB state to the current brw_codegen instruc...
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 struct brw_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct gen_device_info *devinfo = p->devinfo;
51 if (devinfo->gen < 6)
52 return;
53
54 if (src->file == BRW_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58 brw_push_insn_state(p);
59 brw_set_default_exec_size(p, BRW_EXECUTE_8);
60 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
61 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
62 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
63 retype(*src, BRW_REGISTER_TYPE_UD));
64 brw_pop_insn_state(p);
65 }
66 *src = brw_message_reg(msg_reg_nr);
67 }
68
69 static void
70 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
71 {
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
76 *
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
79 */
80 const struct gen_device_info *devinfo = p->devinfo;
81 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
82 reg->file = BRW_GENERAL_REGISTER_FILE;
83 reg->nr += GEN7_MRF_HACK_START;
84 }
85 }
86
/**
 * Encode \p dest as the destination operand of \p inst.
 *
 * Handles the several destination encodings: Gen12+ SEND(C) and pre-Gen12
 * split-send (SENDS/SENDSC) compacted destination fields, direct vs.
 * register-indirect addressing, and the Align1 vs. Align16 subregister /
 * writemask layouts.  May also shrink the instruction's execution size to
 * match a small register when p->automatic_exec_sizes is set.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where if the destination is Byte,
    * the instruction needs to have a stride of 2 (except for packed byte
    * MOV). This seems to be required even if the destination is the NULL
    * register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   gen7_convert_mrf_to_grf(p, &dest);

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gen12+ SEND(C) destination: GRF or ARF only, direct addressing,
       * no subregister, trivial region.
       */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gen12 split-send destination: like the above, but a 16-byte
       * aligned subregister is representable.
       */
      assert(devinfo->gen < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            /* A horizontal stride of 0 is not valid for a destination;
             * encode it as 1 instead.
             */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             * Although Dst.HorzStride is a don't care for Align16, HW needs
             * this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            /* As above, hstride 0 is invalid for a destination. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct. However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
205
/**
 * Encode \p reg as the first source operand of \p inst.
 *
 * Handles immediates (including the pre-Gen12 requirement that src1 mirror
 * a sub-64-bit immediate's type), the compacted SEND(C)/SENDS(C) payload
 * encodings, direct vs. register-indirect addressing, and Align1 regions
 * vs. Align16 swizzles.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gen12+ SEND(C) payload source: no immediates, direct addressing,
       * no subregister, trivial region.
       */
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gen12 split-send payload source; a 16-byte-aligned subregister
       * is representable.
       */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             reg.vstride == reg.width + 1);
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         /* Prior to Gen12, a sub-64-bit immediate also requires src1 to
          * claim the ARF file and repeat the immediate's hardware type.
          */
         if (devinfo->gen < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            /* The indirect-address immediate is a different size in align1
             * vs. align16.
             */
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A scalar operand is encoded as a <0;1,0> region. */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
339
340
/**
 * Encode \p reg as the second source operand of \p inst.
 *
 * Handles the compacted SEND(C)/SENDS(C) extended-payload encoding,
 * 32-bit immediates, direct addressing, and Align1 regions vs. Align16
 * swizzles.  Register-indirect addressing is not supported on src1.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->gen >= 12 &&
        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
      /* Split-send second payload: GRF or ARF only, direct addressing,
       * no subregister, trivial region.
       */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *    operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gen7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A scalar operand is encoded as a <0;1,0> region. */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
446
447 /**
448 * Specify the descriptor and extended descriptor immediate for a SEND(C)
449 * message instruction.
450 */
451 void
452 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
453 unsigned desc, unsigned ex_desc)
454 {
455 const struct gen_device_info *devinfo = p->devinfo;
456 assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
457 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
458 if (devinfo->gen < 12)
459 brw_inst_set_src1_file_type(devinfo, inst,
460 BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
461 brw_inst_set_send_desc(devinfo, inst, desc);
462 if (devinfo->gen >= 9)
463 brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
464 }
465
466 static void brw_set_math_message( struct brw_codegen *p,
467 brw_inst *inst,
468 unsigned function,
469 unsigned integer_type,
470 bool low_precision,
471 unsigned dataType )
472 {
473 const struct gen_device_info *devinfo = p->devinfo;
474 unsigned msg_length;
475 unsigned response_length;
476
477 /* Infer message length from the function */
478 switch (function) {
479 case BRW_MATH_FUNCTION_POW:
480 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
481 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
482 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
483 msg_length = 2;
484 break;
485 default:
486 msg_length = 1;
487 break;
488 }
489
490 /* Infer response length from the function */
491 switch (function) {
492 case BRW_MATH_FUNCTION_SINCOS:
493 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
494 response_length = 2;
495 break;
496 default:
497 response_length = 1;
498 break;
499 }
500
501 brw_set_desc(p, inst, brw_message_desc(
502 devinfo, msg_length, response_length, false));
503
504 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
505 brw_inst_set_math_msg_function(devinfo, inst, function);
506 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
507 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
508 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
509 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
510 brw_inst_set_saturate(devinfo, inst, 0);
511 }
512
513
/* Encode \p insn as an URB FF_SYNC message (URB opcode 1) with a
 * one-register payload and a header.
 *
 * \param allocate         whether to request URB handle allocation
 * \param response_length  response length in registers
 * \param end_of_thread    set the EOT bit on the message
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* msg_length = 1, header_present = true */
   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
535
/* Encode \p insn as an URB write message, handling the per-generation
 * differences in field layout (allocate/used/complete before Gen7,
 * per-slot offset from Gen7 on).
 */
static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Reject flag/feature combinations that don't exist on this gen. */
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
578
/* Encode \p inst as a Gen7+ data-port scratch block read/write message.
 *
 * \param write                  true = scratch write, false = scratch read
 * \param dword                  scratch message type bit (presumably DWord
 *                               vs. OWord access -- confirm against PRM)
 * \param invalidate_after_read  invalidate-after-read descriptor bit
 * \param num_regs               registers to transfer: 1, 2, 4, or (Gen8+) 8
 * \param addr_offset            scratch space address offset
 * \param mlen, rlen             message/response lengths in registers
 * \param header_present         whether a message header is included
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   /* The block-size field changed encoding on Gen8: log2 of the register
    * count instead of count minus one.
    */
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
608
/* Copy the codegen default state \p state into the corresponding fields of
 * \p insn.  Applied to every newly emitted instruction by brw_next_insn().
 */
static void
brw_inst_set_state(const struct gen_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   /* Gen12+: encode the default software scoreboard (SWSB) dependency
    * annotation into the instruction.
    */
   if (devinfo->gen >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* 3-src Align16 instructions keep their flag register fields in a
    * different encoding location than everything else.
    */
   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
639
640 #define next_insn brw_next_insn
641 brw_inst *
642 brw_next_insn(struct brw_codegen *p, unsigned opcode)
643 {
644 const struct gen_device_info *devinfo = p->devinfo;
645 brw_inst *insn;
646
647 if (p->nr_insn + 1 > p->store_size) {
648 p->store_size <<= 1;
649 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
650 }
651
652 p->next_insn_offset += 16;
653 insn = &p->store[p->nr_insn++];
654
655 memset(insn, 0, sizeof(*insn));
656 brw_inst_set_opcode(devinfo, insn, opcode);
657
658 /* Apply the default instruction state */
659 brw_inst_set_state(devinfo, insn, p->current);
660
661 return insn;
662 }
663
664 static brw_inst *
665 brw_alu1(struct brw_codegen *p, unsigned opcode,
666 struct brw_reg dest, struct brw_reg src)
667 {
668 brw_inst *insn = next_insn(p, opcode);
669 brw_set_dest(p, insn, dest);
670 brw_set_src0(p, insn, src);
671 return insn;
672 }
673
674 static brw_inst *
675 brw_alu2(struct brw_codegen *p, unsigned opcode,
676 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
677 {
678 /* 64-bit immediates are only supported on 1-src instructions */
679 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
680 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
681
682 brw_inst *insn = next_insn(p, opcode);
683 brw_set_dest(p, insn, dest);
684 brw_set_src0(p, insn, src0);
685 brw_set_src1(p, insn, src1);
686 return insn;
687 }
688
689 static int
690 get_3src_subreg_nr(struct brw_reg reg)
691 {
692 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
693 * use 32-bit units (components 0..7). Since they only support F/D/UD
694 * types, this doesn't lose any flexibility, but uses fewer bits.
695 */
696 return reg.subnr / 4;
697 }
698
699 static enum gen10_align1_3src_vertical_stride
700 to_3src_align1_vstride(const struct gen_device_info *devinfo,
701 enum brw_vertical_stride vstride)
702 {
703 switch (vstride) {
704 case BRW_VERTICAL_STRIDE_0:
705 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
706 case BRW_VERTICAL_STRIDE_1:
707 assert(devinfo->gen >= 12);
708 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
709 case BRW_VERTICAL_STRIDE_2:
710 assert(devinfo->gen < 12);
711 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
712 case BRW_VERTICAL_STRIDE_4:
713 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
714 case BRW_VERTICAL_STRIDE_8:
715 case BRW_VERTICAL_STRIDE_16:
716 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
717 default:
718 unreachable("invalid vstride");
719 }
720 }
721
722
723 static enum gen10_align1_3src_src_horizontal_stride
724 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
725 {
726 switch (hstride) {
727 case BRW_HORIZONTAL_STRIDE_0:
728 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
729 case BRW_HORIZONTAL_STRIDE_1:
730 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
731 case BRW_HORIZONTAL_STRIDE_2:
732 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
733 case BRW_HORIZONTAL_STRIDE_4:
734 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
735 default:
736 unreachable("invalid hstride");
737 }
738 }
739
740 static brw_inst *
741 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
742 struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
743 {
744 const struct gen_device_info *devinfo = p->devinfo;
745 brw_inst *inst = next_insn(p, opcode);
746
747 gen7_convert_mrf_to_grf(p, &dest);
748
749 assert(dest.nr < 128);
750 assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
751 assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
752 assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
753 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
754 assert(src0.address_mode == BRW_ADDRESS_DIRECT);
755 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
756 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
757
758 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
759 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
760 dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
761
762 if (devinfo->gen >= 12) {
763 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
764 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
765 } else {
766 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
767 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
768 BRW_ALIGN1_3SRC_ACCUMULATOR);
769 brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
770 } else {
771 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
772 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
773 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
774 }
775 }
776 brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
777
778 brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
779
780 if (brw_reg_type_is_floating_point(dest.type)) {
781 brw_inst_set_3src_a1_exec_type(devinfo, inst,
782 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
783 } else {
784 brw_inst_set_3src_a1_exec_type(devinfo, inst,
785 BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
786 }
787
788 brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
789 brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
790 brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
791 brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
792
793 brw_inst_set_3src_a1_src0_vstride(
794 devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
795 brw_inst_set_3src_a1_src1_vstride(
796 devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
797 /* no vstride on src2 */
798
799 brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
800 to_3src_align1_hstride(src0.hstride));
801 brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
802 to_3src_align1_hstride(src1.hstride));
803 brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
804 to_3src_align1_hstride(src2.hstride));
805
806 brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
807 if (src0.type == BRW_REGISTER_TYPE_NF) {
808 brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
809 } else {
810 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
811 }
812 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
813 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
814
815 brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
816 if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
817 brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
818 } else {
819 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
820 }
821 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
822 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
823
824 brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
825 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
826 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
827 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
828
829 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
830 src0.file == BRW_IMMEDIATE_VALUE ||
831 (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
832 src0.type == BRW_REGISTER_TYPE_NF));
833 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
834 src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
835 assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
836 src2.file == BRW_IMMEDIATE_VALUE);
837
838 if (devinfo->gen >= 12) {
839 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
840 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);
841 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
842 } else {
843 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
844 src0.file == BRW_GENERAL_REGISTER_FILE ?
845 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
846 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
847 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
848 src1.file == BRW_GENERAL_REGISTER_FILE ?
849 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
850 BRW_ALIGN1_3SRC_ACCUMULATOR);
851 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
852 src2.file == BRW_GENERAL_REGISTER_FILE ?
853 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
854 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
855 }
856
857 } else {
858 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
859 dest.file == BRW_MESSAGE_REGISTER_FILE);
860 assert(dest.type == BRW_REGISTER_TYPE_F ||
861 dest.type == BRW_REGISTER_TYPE_DF ||
862 dest.type == BRW_REGISTER_TYPE_D ||
863 dest.type == BRW_REGISTER_TYPE_UD ||
864 (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
865 if (devinfo->gen == 6) {
866 brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
867 dest.file == BRW_MESSAGE_REGISTER_FILE);
868 }
869 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
870 brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
871 brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
872
873 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
874 brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
875 brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
876 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
877 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
878 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
879 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
880 src0.vstride == BRW_VERTICAL_STRIDE_0);
881
882 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
883 brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
884 brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
885 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
886 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
887 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
888 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
889 src1.vstride == BRW_VERTICAL_STRIDE_0);
890
891 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
892 brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
893 brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
894 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
895 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
896 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
897 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
898 src2.vstride == BRW_VERTICAL_STRIDE_0);
899
900 if (devinfo->gen >= 7) {
901 /* Set both the source and destination types based on dest.type,
902 * ignoring the source register types. The MAD and LRP emitters ensure
903 * that all four types are float. The BFE and BFI2 emitters, however,
904 * may send us mixed D and UD types and want us to ignore that and use
905 * the destination type.
906 */
907 brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
908 brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
909
910 /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
911 *
912 * "Three source instructions can use operands with mixed-mode
913 * precision. When SrcType field is set to :f or :hf it defines
914 * precision for source 0 only, and fields Src1Type and Src2Type
915 * define precision for other source operands:
916 *
917 * 0b = :f. Single precision Float (32-bit).
918 * 1b = :hf. Half precision Float (16-bit)."
919 */
920 if (src1.type == BRW_REGISTER_TYPE_HF)
921 brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
922
923 if (src2.type == BRW_REGISTER_TYPE_HF)
924 brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
925 }
926 }
927
928 return inst;
929 }
930
931
932 /***********************************************************************
933 * Convenience routines.
934 */
/* Define a brw_<OP>() emitter for a one-source ALU instruction: a thin
 * wrapper that forwards to brw_alu1() with the matching opcode.
 */
#define ALU1(OP)                                                \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
          struct brw_reg dest,                                  \
          struct brw_reg src0)                                  \
{                                                               \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);             \
}
942
/* Define a brw_<OP>() emitter for a two-source ALU instruction: a thin
 * wrapper that forwards to brw_alu2() with the matching opcode.
 */
#define ALU2(OP)                                                \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
          struct brw_reg dest,                                  \
          struct brw_reg src0,                                  \
          struct brw_reg src1)                                  \
{                                                               \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);       \
}
951
/* Define a brw_<OP>() emitter for a three-source ALU instruction.  In
 * Align16 mode a scalar source (vstride 0) must instead be expressed as
 * a replicated XXXX swizzle, so fix that up before emitting.
 */
#define ALU3(OP)                                                \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
          struct brw_reg dest,                                  \
          struct brw_reg src0,                                  \
          struct brw_reg src1,                                  \
          struct brw_reg src2)                                  \
{                                                               \
   if (p->current->access_mode == BRW_ALIGN_16) {               \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                \
         src0.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                \
         src1.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                \
         src2.swizzle = BRW_SWIZZLE_XXXX;                       \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
969
/* Like ALU3, but for float-only three-source instructions: the sources
 * must all match the destination type, which must be F or DF.  The
 * Align16 scalar-to-XXXX swizzle fixup is the same as in ALU3.
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
         struct brw_reg dest,                                   \
         struct brw_reg src0,                                   \
         struct brw_reg src1,                                   \
         struct brw_reg src2)                                   \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
                                                                \
   if (p->current->access_mode == BRW_ALIGN_16) {               \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                \
         src0.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                \
         src1.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                \
         src2.swizzle = BRW_SWIZZLE_XXXX;                       \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
999
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * The BRW_CONDITIONAL_R modifier on the first instruction is what enables
 * the round-increment flag generation; the predicated ADD then consumes it.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                   \
void brw_##OP(struct brw_codegen *p,                                \
              struct brw_reg dest,                                  \
              struct brw_reg src)                                   \
{                                                                   \
   const struct gen_device_info *devinfo = p->devinfo;              \
   brw_inst *rnd, *add;                                             \
   rnd = next_insn(p, BRW_OPCODE_##OP);                             \
   brw_set_dest(p, rnd, dest);                                      \
   brw_set_src0(p, rnd, src);                                       \
                                                                    \
   if (devinfo->gen < 6) {                                          \
      /* turn on round-increments */                                \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);  \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
   }                                                                \
}
1025
1026
/* Instantiate the brw_<OP>() emitters declared in brw_eu.h from the ALU
 * macros above.  Each expands to a wrapper around brw_alu1/2/3.
 */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

/* RNDZ/RNDE need the two-instruction round-then-fixup sequence pre-gen6. */
ROUND(RNDZ)
ROUND(RNDE)
1062
1063 brw_inst *
1064 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1065 {
1066 const struct gen_device_info *devinfo = p->devinfo;
1067
1068 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1069 * To avoid the problems that causes, we use an <X,2,0> source region to
1070 * read each element twice.
1071 */
1072 if (devinfo->gen == 7 && !devinfo->is_haswell &&
1073 brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1074 dest.type == BRW_REGISTER_TYPE_DF &&
1075 (src0.type == BRW_REGISTER_TYPE_F ||
1076 src0.type == BRW_REGISTER_TYPE_D ||
1077 src0.type == BRW_REGISTER_TYPE_UD) &&
1078 !has_scalar_region(src0)) {
1079 assert(src0.vstride == src0.width + src0.hstride);
1080 src0.vstride = src0.hstride;
1081 src0.width = BRW_WIDTH_2;
1082 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1083 }
1084
1085 return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1086 }
1087
1088 brw_inst *
1089 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1090 struct brw_reg src0, struct brw_reg src1)
1091 {
1092 /* 6.2.2: add */
1093 if (src0.type == BRW_REGISTER_TYPE_F ||
1094 (src0.file == BRW_IMMEDIATE_VALUE &&
1095 src0.type == BRW_REGISTER_TYPE_VF)) {
1096 assert(src1.type != BRW_REGISTER_TYPE_UD);
1097 assert(src1.type != BRW_REGISTER_TYPE_D);
1098 }
1099
1100 if (src1.type == BRW_REGISTER_TYPE_F ||
1101 (src1.file == BRW_IMMEDIATE_VALUE &&
1102 src1.type == BRW_REGISTER_TYPE_VF)) {
1103 assert(src0.type != BRW_REGISTER_TYPE_UD);
1104 assert(src0.type != BRW_REGISTER_TYPE_D);
1105 }
1106
1107 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1108 }
1109
1110 brw_inst *
1111 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1112 struct brw_reg src0, struct brw_reg src1)
1113 {
1114 assert(dest.type == src0.type);
1115 assert(src0.type == src1.type);
1116 switch (src0.type) {
1117 case BRW_REGISTER_TYPE_B:
1118 case BRW_REGISTER_TYPE_UB:
1119 case BRW_REGISTER_TYPE_W:
1120 case BRW_REGISTER_TYPE_UW:
1121 case BRW_REGISTER_TYPE_D:
1122 case BRW_REGISTER_TYPE_UD:
1123 break;
1124 default:
1125 unreachable("Bad type for brw_AVG");
1126 }
1127
1128 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1129 }
1130
1131 brw_inst *
1132 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1133 struct brw_reg src0, struct brw_reg src1)
1134 {
1135 /* 6.32.38: mul */
1136 if (src0.type == BRW_REGISTER_TYPE_D ||
1137 src0.type == BRW_REGISTER_TYPE_UD ||
1138 src1.type == BRW_REGISTER_TYPE_D ||
1139 src1.type == BRW_REGISTER_TYPE_UD) {
1140 assert(dest.type != BRW_REGISTER_TYPE_F);
1141 }
1142
1143 if (src0.type == BRW_REGISTER_TYPE_F ||
1144 (src0.file == BRW_IMMEDIATE_VALUE &&
1145 src0.type == BRW_REGISTER_TYPE_VF)) {
1146 assert(src1.type != BRW_REGISTER_TYPE_UD);
1147 assert(src1.type != BRW_REGISTER_TYPE_D);
1148 }
1149
1150 if (src1.type == BRW_REGISTER_TYPE_F ||
1151 (src1.file == BRW_IMMEDIATE_VALUE &&
1152 src1.type == BRW_REGISTER_TYPE_VF)) {
1153 assert(src0.type != BRW_REGISTER_TYPE_UD);
1154 assert(src0.type != BRW_REGISTER_TYPE_D);
1155 }
1156
1157 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1158 src0.nr != BRW_ARF_ACCUMULATOR);
1159 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1160 src1.nr != BRW_ARF_ACCUMULATOR);
1161
1162 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1163 }
1164
1165 brw_inst *
1166 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1167 struct brw_reg src0, struct brw_reg src1)
1168 {
1169 src0.vstride = BRW_VERTICAL_STRIDE_0;
1170 src0.width = BRW_WIDTH_1;
1171 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1172 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1173 }
1174
1175 brw_inst *
1176 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1177 struct brw_reg src0, struct brw_reg src1)
1178 {
1179 src0.vstride = BRW_VERTICAL_STRIDE_0;
1180 src0.width = BRW_WIDTH_1;
1181 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1182 src1.vstride = BRW_VERTICAL_STRIDE_8;
1183 src1.width = BRW_WIDTH_8;
1184 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1185 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1186 }
1187
/**
 * Emit a 32-bit-float to 16-bit-float conversion into \p dst, either as a
 * converting MOV (gen8+) or the dedicated F32TO16 instruction (gen7).
 * When the destination is UD-typed and the hardware would leave the high
 * 16 bits of each dword undefined, a second MOV explicitly zeroes them.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* View dst as words with stride 2 so the conversion writes only the
       * even (low) words; the odd words are cleared below.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* The two MOVs write disjoint words of the same register, so
       * suppress the destination dependency between them.  Gen12+ does
       * not take these bits — presumably handled by SWSB scoreboarding;
       * confirm against the gen12 encoding.
       */
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1236
1237 brw_inst *
1238 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1239 {
1240 const struct gen_device_info *devinfo = p->devinfo;
1241 bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1242
1243 if (align16) {
1244 assert(src.type == BRW_REGISTER_TYPE_UD);
1245 } else {
1246 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1247 *
1248 * Because this instruction does not have a 16-bit floating-point
1249 * type, the source data type must be Word (W). The destination type
1250 * must be F (Float).
1251 */
1252 if (src.type == BRW_REGISTER_TYPE_UD)
1253 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1254
1255 assert(src.type == BRW_REGISTER_TYPE_W ||
1256 src.type == BRW_REGISTER_TYPE_UW ||
1257 src.type == BRW_REGISTER_TYPE_HF);
1258 }
1259
1260 if (devinfo->gen >= 8) {
1261 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1262 } else {
1263 assert(devinfo->gen == 7);
1264 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1265 }
1266 }
1267
1268
1269 void brw_NOP(struct brw_codegen *p)
1270 {
1271 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1272 memset(insn, 0, sizeof(*insn));
1273 brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1274 }
1275
1276 void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
1277 {
1278 brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
1279 brw_inst_set_cond_modifier(p->devinfo, insn, func);
1280 }
1281
1282 /***********************************************************************
1283 * Comparisons, if/else/endif
1284 */
1285
1286 brw_inst *
1287 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1288 unsigned predicate_control)
1289 {
1290 const struct gen_device_info *devinfo = p->devinfo;
1291 struct brw_reg ip = brw_ip_reg();
1292 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1293
1294 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1295 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1296 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1297 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1298
1299 return inst;
1300 }
1301
1302 static void
1303 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1304 {
1305 p->if_stack[p->if_stack_depth] = inst - p->store;
1306
1307 p->if_stack_depth++;
1308 if (p->if_stack_array_size <= p->if_stack_depth) {
1309 p->if_stack_array_size *= 2;
1310 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1311 p->if_stack_array_size);
1312 }
1313 }
1314
1315 static brw_inst *
1316 pop_if_stack(struct brw_codegen *p)
1317 {
1318 p->if_stack_depth--;
1319 return &p->store[p->if_stack[p->if_stack_depth]];
1320 }
1321
1322 static void
1323 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1324 {
1325 if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1326 p->loop_stack_array_size *= 2;
1327 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1328 p->loop_stack_array_size);
1329 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1330 p->loop_stack_array_size);
1331 }
1332
1333 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1334 p->loop_stack_depth++;
1335 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1336 }
1337
1338 static brw_inst *
1339 get_inner_do_insn(struct brw_codegen *p)
1340 {
1341 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1342 }
1343
/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction.  The operand encoding
    * of IF varies per generation; jump offsets are left at zero here and
    * filled in later by patch_IF_ELSE() (called from brw_ENDIF()).
    */
   if (devinfo->gen < 6) {
      /* Gen4/5: IF operates on IP with the jump count in src1. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: the jump count lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: separate JIP/UIP fields; src1 holds an immediate. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      /* Gen12+ dropped the src0 immediate from the encoding. */
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control implies a thread switch — except in SPF mode,
    * where this IF may later be rewritten into an ADD.
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1401
/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* The gen6 jump count (destination immediate) is filled in later by
    * patch_IF_ELSE(), called from brw_ENDIF().
    */
   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The embedded compare replaces a separate CMP: the IF itself must be
    * unpredicated, with the comparison in the conditional-modifier field.
    */
   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1427
/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 *
 * The if-stack already holds the IF (and optional ELSE); this rewrites
 * them in place into predicated ADDs on IP so no mask-stack operations
 * are needed.  Called instead of emitting an ENDIF (see brw_ENDIF).
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* Instruction distances are scaled by 16 — presumably the byte size
       * of a native instruction on the pre-gen6 hardware this path runs
       * on (IP offsets being in bytes); confirm against the PRM.
       */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1468
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * Called from brw_ENDIF() once the location of the ENDIF is known.  All
 * distances are scaled by brw_jump_scale(), which encapsulates the
 * per-generation unit of jump offsets.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         /* Gen7+: both JIP and UIP land on the ENDIF. */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1559
/**
 * Emit an ELSE and push it on the if-stack, so brw_ENDIF() can pair it
 * with its IF and patch the jump offsets (left as zero here).
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* The operand encoding of ELSE mirrors brw_IF() for each generation. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      /* Gen12+ dropped the src0 immediate from the encoding. */
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control implies a thread switch — except in SPF mode,
    * where this ELSE may later be rewritten into an ADD.
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1598
/**
 * Terminate an IF/ELSE block: pop the matching IF (and optional ELSE)
 * off the if-stack, emit the ENDIF (unless it can be elided in pre-gen6
 * SPF mode), and patch all the jump offsets via patch_IF_ELSE().
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation ENDIF operand encoding, mirroring brw_IF()/brw_ELSE(). */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1679
/**
 * Emit a BREAK out of the innermost loop.  Jump offsets are left for
 * later patching; on gen4/5 the mask-stack pop count is set to the
 * number of IFs currently open inside the loop.
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      /* Gen4/5: BREAK operates on IP and pops the open IF levels. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
1706
/**
 * Emit a CONTINUE for the innermost loop.  Jump offsets are left for
 * later patching; on gen4/5 the mask-stack pop count is set to the
 * number of IFs currently open inside the loop.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1730
/**
 * Emit a HALT.  Despite the gen6_ prefix this also handles later
 * generations (the encoding varies at gen8 and gen12).  The jump targets
 * are left at zero and updated after emission.
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen < 8) {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   } else if (devinfo->gen < 12) {
      /* Gen8-11: a single src0 immediate; gen12+ dropped it. */
      brw_set_src0(p, insn, brw_imm_d(0x0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1750
1751 /* DO/WHILE loop:
1752 *
1753 * The DO/WHILE is just an unterminated loop -- break or continue are
1754 * used for control within the loop. We have a few ways they can be
1755 * done.
1756 *
1757 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1758 * jip and no DO instruction.
1759 *
1760 * For non-uniform control flow pre-gen6, there's a DO instruction to
1761 * push the mask, and a WHILE to jump back, and BREAK to get out and
1762 * pop the mask.
1763 *
1764 * For gen6, there's no more mask stack, so no need for DO. WHILE
1765 * just points back to the first instruction of the loop.
1766 */
/**
 * Open a loop: push a loop-stack entry and, on pre-gen6 non-uniform
 * control flow, emit an actual DO instruction.
 *
 * On gen6+ (or with single_program_flow) no DO instruction exists/is
 * needed, so we just record the address of the loop's first instruction
 * for the matching brw_WHILE() to jump back to.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      /* No instruction emitted: the returned pointer is the (future) first
       * instruction of the loop body, used as the WHILE's jump target.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1793
1794 /**
1795 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1796 * instruction here.
1797 *
1798 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1799 * nesting, since it can always just point to the end of the block/current loop.
1800 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   /* Walk backwards from the WHILE to the matching DO, fixing up every
    * not-yet-patched BREAK/CONT in between.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps to the instruction after the WHILE (hence the +1). */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONT jumps to the WHILE itself so the loop test runs again. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
1825
/**
 * Close the innermost loop opened by brw_DO() by emitting the WHILE
 * (or, pre-gen6 with single_program_flow, a plain ADD to IP).
 *
 * Pops one level off the loop stack.  The backward jump distance is
 * computed from the DO instruction recorded by brw_DO(); encodings
 * differ per generation (JIP on gen7+, gen6 jump count, gen4 jump
 * count plus BREAK/CONT patching).
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         /* Gen12+ has no immediate src0 for WHILE; JIP lives in its own field. */
         if (devinfo->gen < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* Uniform control flow: the loop is just a backward IP add.
          * (do_insn - insn) is negative; *16 converts to bytes per
          * gen4 instruction size.
          */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         /* Inherit the exec size the loop was opened with. */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Now that the WHILE's address is known, fix up the loop's
          * BREAK/CONT jump targets.
          */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1888
1889 /* FORWARD JUMPS:
1890 */
1891 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1892 {
1893 const struct gen_device_info *devinfo = p->devinfo;
1894 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1895 unsigned jmpi = 1;
1896
1897 if (devinfo->gen >= 5)
1898 jmpi = 2;
1899
1900 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1901 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1902
1903 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1904 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1905 }
1906
1907 /* To integrate with the above, it makes sense that the comparison
1908 * instruction should populate the flag register. It might be simpler
1909 * just to use the flag reg for most WM tasks?
1910 */
1911 void brw_CMP(struct brw_codegen *p,
1912 struct brw_reg dest,
1913 unsigned conditional,
1914 struct brw_reg src0,
1915 struct brw_reg src1)
1916 {
1917 const struct gen_device_info *devinfo = p->devinfo;
1918 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1919
1920 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1921 brw_set_dest(p, insn, dest);
1922 brw_set_src0(p, insn, src0);
1923 brw_set_src1(p, insn, src1);
1924
1925 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1926 * page says:
1927 * "Any CMP instruction with a null destination must use a {switch}."
1928 *
1929 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1930 * mentioned on their work-arounds pages.
1931 */
1932 if (devinfo->gen == 7) {
1933 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1934 dest.nr == BRW_ARF_NULL) {
1935 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1936 }
1937 }
1938 }
1939
1940 /***********************************************************************
1941 * Helpers for the various SEND message types:
1942 */
1943
/** Extended math function, float[8].
 *
 * Pre-gen6 only: math is a SEND to the shared math box rather than a
 * native MATH instruction.  \p msg_reg_nr is the MRF holding the operand;
 * \p precision and the scalar/vector data type are encoded into the
 * message descriptor by brw_set_math_message().
 */
void gen4_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               unsigned msg_reg_nr,
               struct brw_reg src,
               unsigned precision )
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   unsigned data_type;
   /* A scalar region broadcasts one value; tell the math box which form
    * the operand takes.
    */
   if (has_scalar_region(src)) {
      data_type = BRW_MATH_DATA_SCALAR;
   } else {
      data_type = BRW_MATH_DATA_VECTOR;
   }

   assert(devinfo->gen < 6);

   /* Example code doesn't set predicate_control for send
    * instructions.
    */
   brw_inst_set_pred_control(devinfo, insn, 0);
   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
                        insn,
                        function,
                        src.type == BRW_REGISTER_TYPE_D,
                        precision,
                        data_type);
}
1979
/**
 * Emit a native MATH instruction (Gen6+).
 *
 * The asserts encode per-generation ISA restrictions: destination must be
 * GRF (or MRF on Gen7+), strides must be 1 on Gen6, integer-divide
 * functions take integer operands (src1 may be immediate on Gen8+), all
 * other functions take float (or half-float on Gen9+), and Gen6 ignores
 * source modifiers so they must not be present.
 */
void gen6_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F ||
             (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
      assert(src1.type == BRW_REGISTER_TYPE_F ||
             (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
2028
2029 /**
2030 * Return the right surface index to access the thread scratch space using
2031 * stateless dataport messages.
2032 */
2033 unsigned
2034 brw_scratch_surface_idx(const struct brw_codegen *p)
2035 {
2036 /* The scratch space is thread-local so IA coherency is unnecessary. */
2037 if (p->devinfo->gen >= 8)
2038 return GEN8_BTI_STATELESS_NON_COHERENT;
2039 else
2040 return BRW_BTI_STATELESS;
2041 }
2042
/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 *
 * \param mrf       message register holding the header (payload follows it)
 * \param num_regs  number of GRFs of payload to write
 * \param offset    byte offset into scratch (oword units on Gen6+)
 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* The dataport moved between caches across generations. */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   uint32_t msg_type;

   /* Gen6+ expresses the scratch offset in owords rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One header register plus the payload. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
         src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                     msg_type, 0, /* not a render target */
                                     send_commit_msg));
   }
}
2148
2149
/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes).  Used for register
 * spilling.
 *
 * \param dest      registers that receive the data (num_regs GRFs)
 * \param mrf       message register for the header (ignored on Gen7+,
 *                  where \p dest doubles as the header source)
 * \param num_regs  number of GRFs to read
 * \param offset    byte offset into scratch (oword units on Gen6+)
 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Gen6+ expresses the scratch offset in owords rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_READ);

   /* Build the message header: copy g0 and store the scratch offset in
    * element 2, without clobbering g0 itself.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest); /* UW? */
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, 1, rlen, true) |
                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   }
}
2227
/**
 * Gen7+ scratch block read using the dedicated DP scratch message.
 *
 * Unlike brw_oword_block_read_scratch(), no explicit header setup is
 * needed: the hardware takes the scratch base from g0.5, so plain g0 is
 * sent as the header.
 *
 * \param dest      registers that receive the data (num_regs GRFs)
 * \param num_regs  number of GRFs to read
 * \param offset    byte offset into scratch; must be register-aligned
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
2261
/**
 * Read float[4] vectors from the data port constant cache.
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 *
 * \param dest              destination register for the constants
 * \param mrf               message register used for the header
 * \param offset            byte offset into the buffer (oword units on Gen6+)
 * \param bind_table_index  surface binding-table index of the buffer
 */
void brw_oword_block_read(struct brw_codegen *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_SFID_DATAPORT_READ);
   /* Read one oword per 8 channels of the current execution size. */
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Outer state push: the SEND below must also be unpredicated,
    * uncompressed and WE_all.
    */
   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_inst_set_sfid(devinfo, insn, target_cache);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
                brw_dp_read_desc(devinfo, bind_table_index,
                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));

   brw_pop_insn_state(p);
}
2327
/**
 * Emit a framebuffer (render target) write message.
 *
 * Gen6+ uses SENDC (waits for dependent rendering) and a headerless
 * color payload; pre-gen6 uses SEND with an implied header moved into
 * MRFs.  Returns the SEND instruction so the caller can inspect/patch
 * it (e.g. for SIMD lowering).
 *
 * \param eot                 terminate the thread after this message
 * \param last_render_target  set the "last RT write" descriptor bit
 */
brw_inst *
brw_fb_WRITE(struct brw_codegen *p,
             struct brw_reg payload,
             struct brw_reg implied_header,
             unsigned msg_control,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool eot,
             bool last_render_target,
             bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   /* The null destination only needs to match the execution width. */
   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_dp_write_desc(devinfo, binding_table_index, msg_control,
                                  msg_type, last_render_target,
                                  0 /* send_commit_msg */));
   brw_inst_set_eot(devinfo, insn, eot);

   return insn;
}
2386
/**
 * Emit a Gen9+ render target read message.
 *
 * \param per_sample  read per-sample rather than per-pixel data (encoded
 *                    in bit 5 of the message subtype)
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   /* Subtype 0 is SIMD16, 1 is SIMD8. */
   const unsigned msg_subtype =
      brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_desc(
      p, insn,
      brw_message_desc(devinfo, msg_length, response_length, true) |
      brw_dp_read_desc(devinfo, binding_table_index,
                       per_sample << 5 | msg_subtype,
                       GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                       BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   /* Select which half of a SIMD32 dispatch this SIMD16 message covers. */
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}
2416
/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 *
 * \param msg_reg_nr  MRF for the payload, or -1 to skip the implied move
 *                    (src0 is then sent directly)
 */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send).  The hardware behavior is undefined if this instruction is
    *     set as compressed.  However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_sampler_desc(devinfo, binding_table_index, sampler,
                                 msg_type, simd_mode, return_format));
}
2470
/* Adjust the message header's sampler state pointer to
 * select the correct group of 16 samplers.
 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         /* >16 samplers requires HSW+ (asserted below); bump the state
          * pointer in header element 3 past the first N groups of 16.
          */
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* Compute 16 * (index / 16) * 16 dynamically: mask off the low
       * nibble of the index, then shift left by 4.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2516
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */

/**
 * Emit a URB write message.
 *
 * On Gen7+, unless the caller opts into explicit channel masks via
 * BRW_URB_WRITE_USE_CHANNEL_MASKS, the masks in the message header
 * (m0.5) are force-enabled here before the SEND.
 */
void brw_urb_WRITE(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}
2568
/**
 * Emit a SEND whose descriptor may come from a register.
 *
 * If \p desc is an immediate it is OR'ed with \p desc_imm and encoded
 * directly.  Otherwise the combined descriptor is first loaded into
 * address register a0.0 (the only register SEND can take a descriptor
 * from); on Gen12+ the "reg32 descriptor" select bit is used instead of
 * placing a0 in src1.
 *
 * \param desc_imm  extra descriptor bits OR'ed into \p desc
 * \param eot       terminate the thread after this message
 */
void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm,
                          bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send, desc.ud | desc_imm);
   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));

      if (devinfo->gen >= 12)
         brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
      else
         brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2619
/**
 * Emit a split-payload SEND/SENDS with possibly-indirect descriptors.
 *
 * Each of \p desc and \p ex_desc is either encoded as an immediate
 * (OR'ed with its *_imm companion) or, when it cannot be (register
 * operand, or ex_desc immediate needing bits 15:12 which the encoding
 * lacks), loaded into an address register (a0.0 for desc, a0.2 for
 * ex_desc) and selected via the reg32 bits.
 *
 * \param payload0/payload1  the two payload halves of the split send
 * \param eot                terminate the thread after this message
 */
void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      desc.ud |= desc_imm;
   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       (ex_desc.ud & INTEL_MASK(15, 12)) == 0) {
      ex_desc.ud |= ex_desc_imm;
   } else {
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register.  If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_desc_imm | sfid | eot << 5;

      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding, so
          * we may have fallen back to an indirect extended descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;
   }

   /* Gen12+ folds SENDS back into SEND with the split encoding. */
   send = next_insn(p, devinfo->gen >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2726
/* Emit a SEND to the shared function @sfid whose message descriptor combines
 * @desc_imm with the surface index @surface.  A non-immediate surface index
 * is first masked to 8 bits and loaded into address register a0.0 so that
 * brw_send_indirect_message() can source the descriptor indirectly.
 */
static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      /* Compute the dynamic part of the descriptor with a scalar,
       * unpredicated, NoMask ALU instruction so it executes regardless of
       * the current channel enables.
       */
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
}
2759
2760 static bool
2761 while_jumps_before_offset(const struct gen_device_info *devinfo,
2762 brw_inst *insn, int while_offset, int start_offset)
2763 {
2764 int scale = 16 / brw_jump_scale(devinfo);
2765 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2766 : brw_inst_jip(devinfo, insn);
2767 assert(jip < 0);
2768 return while_offset + jip * scale <= start_offset;
2769 }
2770
2771
2772 static int
2773 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2774 {
2775 int offset;
2776 void *store = p->store;
2777 const struct gen_device_info *devinfo = p->devinfo;
2778
2779 int depth = 0;
2780
2781 for (offset = next_offset(devinfo, store, start_offset);
2782 offset < p->next_insn_offset;
2783 offset = next_offset(devinfo, store, offset)) {
2784 brw_inst *insn = store + offset;
2785
2786 switch (brw_inst_opcode(devinfo, insn)) {
2787 case BRW_OPCODE_IF:
2788 depth++;
2789 break;
2790 case BRW_OPCODE_ENDIF:
2791 if (depth == 0)
2792 return offset;
2793 depth--;
2794 break;
2795 case BRW_OPCODE_WHILE:
2796 /* If the while doesn't jump before our instruction, it's the end
2797 * of a sibling do...while loop. Ignore it.
2798 */
2799 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2800 continue;
2801 /* fallthrough */
2802 case BRW_OPCODE_ELSE:
2803 case BRW_OPCODE_HALT:
2804 if (depth == 0)
2805 return offset;
2806 default:
2807 break;
2808 }
2809 }
2810
2811 return 0;
2812 }
2813
2814 /* There is no DO instruction on gen6, so to find the end of the loop
2815 * we have to see if the loop is jumping back before our start
2816 * instruction.
2817 */
2818 static int
2819 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2820 {
2821 const struct gen_device_info *devinfo = p->devinfo;
2822 int offset;
2823 void *store = p->store;
2824
2825 assert(devinfo->gen >= 6);
2826
2827 /* Always start after the instruction (such as a WHILE) we're trying to fix
2828 * up.
2829 */
2830 for (offset = next_offset(devinfo, store, start_offset);
2831 offset < p->next_insn_offset;
2832 offset = next_offset(devinfo, store, offset)) {
2833 brw_inst *insn = store + offset;
2834
2835 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2836 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2837 return offset;
2838 }
2839 }
2840 assert(!"not reached");
2841 return start_offset;
2842 }
2843
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   /* br is the jump-field scale; jump offsets are stored in units of
    * (16 / br) bytes.
    */
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* Pre-Gen6 instructions have no JIP/UIP fields to fix up here. */
   if (devinfo->gen < 6)
      return;

   /* Step 16 bytes at a time: compaction hasn't happened yet, so every
    * instruction is still full-size (checked by the assert below).
    */
   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         /* JIP: end of the innermost enclosing block. */
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset +
                           (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* UIP: the WHILE at the bottom of the enclosing loop. */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end simply falls through to the
          * next instruction: one jump unit.
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      default:
         break;
      }
   }
}
2919
/**
 * Emit an FF_SYNC SEND message.  The message-specific fields are filled in
 * by brw_set_ff_sync_message(); NOTE(review): the exact unit-level
 * semantics (URB handle allocation, etc.) live there — confirm against its
 * definition.
 */
void brw_ff_sync(struct brw_codegen *p,
                 struct brw_reg dest,
                 unsigned msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* On Gen6+ SEND takes no implicit MRF operand, so move src0 into the
    * message register explicitly if needed (see gen6_resolve_implied_move).
    */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-Gen6 the message register is encoded in the instruction itself. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}
2947
2948 /**
2949 * Emit the SEND instruction necessary to generate stream output data on Gen6
2950 * (for transform feedback).
2951 *
2952 * If send_commit_msg is true, this is the last piece of stream output data
2953 * from this thread, so send the data as a committed write. According to the
2954 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2955 *
2956 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2957 * writes are complete by sending the final write as a committed write."
2958 */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* The streamed-VB write is routed through a different dataport cache
    * depending on hardware generation.
    */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;

   /* On Gen6+ SEND takes no implicit MRF operand; move src0 into the
    * message register explicitly if needed.
    */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   /* Message length 1, response length equal to send_commit_msg (the commit
    * write-back), header present.
    */
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, send_commit_msg, true) |
                brw_dp_write_desc(devinfo, binding_table_index,
                                  0, /* msg_control: ignored */
                                  GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                                  0, /* last_render_target: ignored */
                                  send_commit_msg)); /* send_commit_msg */
}
2988
/* Number of message payload registers needed per data component for a
 * surface message of the given execution size.  A SIMD4x2 message
 * (exec_size == 0) packs everything into a single register; up to SIMD8 one
 * register per channel is needed, and beyond that two.
 */
static unsigned
brw_surface_payload_size(struct brw_codegen *p,
                         unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 */

   return exec_size <= 8 ? num_channels : 2 * num_channels;
}
3001
3002 void
3003 brw_untyped_atomic(struct brw_codegen *p,
3004 struct brw_reg dst,
3005 struct brw_reg payload,
3006 struct brw_reg surface,
3007 unsigned atomic_op,
3008 unsigned msg_length,
3009 bool response_expected,
3010 bool header_present)
3011 {
3012 const struct gen_device_info *devinfo = p->devinfo;
3013 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3014 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3015 GEN7_SFID_DATAPORT_DATA_CACHE);
3016 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3017 /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
3018 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3019 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3020 has_simd4x2 ? 0 : 8;
3021 const unsigned response_length =
3022 brw_surface_payload_size(p, response_expected, exec_size);
3023 const unsigned desc =
3024 brw_message_desc(devinfo, msg_length, response_length, header_present) |
3025 brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
3026 response_expected);
3027 /* Mask out unused components -- This is especially important in Align16
3028 * mode on generations that don't have native support for SIMD4x2 atomics,
3029 * because unused but enabled components will cause the dataport to perform
3030 * additional atomic operations on the addresses that happen to be in the
3031 * uninitialized Y, Z and W coordinates of the payload.
3032 */
3033 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3034
3035 brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
3036 payload, surface, desc);
3037 }
3038
3039 void
3040 brw_untyped_surface_read(struct brw_codegen *p,
3041 struct brw_reg dst,
3042 struct brw_reg payload,
3043 struct brw_reg surface,
3044 unsigned msg_length,
3045 unsigned num_channels)
3046 {
3047 const struct gen_device_info *devinfo = p->devinfo;
3048 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3049 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3050 GEN7_SFID_DATAPORT_DATA_CACHE);
3051 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3052 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
3053 const unsigned response_length =
3054 brw_surface_payload_size(p, num_channels, exec_size);
3055 const unsigned desc =
3056 brw_message_desc(devinfo, msg_length, response_length, false) |
3057 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3058
3059 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3060 }
3061
3062 void
3063 brw_untyped_surface_write(struct brw_codegen *p,
3064 struct brw_reg payload,
3065 struct brw_reg surface,
3066 unsigned msg_length,
3067 unsigned num_channels,
3068 bool header_present)
3069 {
3070 const struct gen_device_info *devinfo = p->devinfo;
3071 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3072 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3073 GEN7_SFID_DATAPORT_DATA_CACHE);
3074 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3075 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3076 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3077 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3078 has_simd4x2 ? 0 : 8;
3079 const unsigned desc =
3080 brw_message_desc(devinfo, msg_length, 0, header_present) |
3081 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
3082 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3083 const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
3084
3085 brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
3086 payload, surface, desc);
3087 }
3088
3089 static void
3090 brw_set_memory_fence_message(struct brw_codegen *p,
3091 struct brw_inst *insn,
3092 enum brw_message_target sfid,
3093 bool commit_enable,
3094 unsigned bti)
3095 {
3096 const struct gen_device_info *devinfo = p->devinfo;
3097
3098 brw_set_desc(p, insn, brw_message_desc(
3099 devinfo, 1, (commit_enable ? 1 : 0), true));
3100
3101 brw_inst_set_sfid(devinfo, insn, sfid);
3102
3103 switch (sfid) {
3104 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3105 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3106 break;
3107 case GEN7_SFID_DATAPORT_DATA_CACHE:
3108 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3109 break;
3110 default:
3111 unreachable("Not reached");
3112 }
3113
3114 if (commit_enable)
3115 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3116
3117 assert(devinfo->gen >= 11 || bti == 0);
3118 brw_inst_set_binding_table_index(devinfo, insn, bti);
3119 }
3120
/**
 * Emit a memory fence through the data cache, optionally stalling until its
 * write-back returns.  On IVB an additional fence is issued through the
 * render cache, since typed surface access goes through it there.
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg src,
                 enum opcode send_op,
                 bool stall,
                 unsigned bti)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* A commit write-back is needed whenever we will stall on the fence, and
    * is required unconditionally on some generations.
    */
   const bool commit_enable = stall ||
      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
      (devinfo->gen == 7 && !devinfo->is_haswell);
   struct brw_inst *insn;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
   src = retype(vec1(src), BRW_REGISTER_TYPE_UD);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, send_op);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, src);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable, bti);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too. Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, send_op);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, src);
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable, bti);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   /* Reading the fence's write-back with a MOV forces the thread to wait
    * until the fence has completed.
    */
   if (stall)
      brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);

   brw_pop_insn_state(p);
}
3174
3175 void
3176 brw_pixel_interpolator_query(struct brw_codegen *p,
3177 struct brw_reg dest,
3178 struct brw_reg mrf,
3179 bool noperspective,
3180 unsigned mode,
3181 struct brw_reg data,
3182 unsigned msg_length,
3183 unsigned response_length)
3184 {
3185 const struct gen_device_info *devinfo = p->devinfo;
3186 const uint16_t exec_size = brw_get_default_exec_size(p);
3187 const unsigned slot_group = brw_get_default_group(p) / 16;
3188 const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3189 const unsigned desc =
3190 brw_message_desc(devinfo, msg_length, response_length, false) |
3191 brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
3192 slot_group);
3193
3194 /* brw_send_indirect_message will automatically use a direct send message
3195 * if data is actually immediate.
3196 */
3197 brw_send_indirect_message(p,
3198 GEN7_SFID_PIXEL_INTERPOLATOR,
3199 dest,
3200 mrf,
3201 vec1(data),
3202 desc,
3203 false);
3204 }
3205
/**
 * Write into component 0 of @dst the index of the first enabled channel of
 * the current execution mask, relative to the current quarter control.
 * @mask is a dispatch/vector mask used to discard channels that the
 * hardware never dispatched (see the ce0 comments below).
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   /* The flag register is only used on Gen7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask. The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n). Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         /* Clear the flag register before accumulating enables into it. */
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0. We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register. Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3327
/**
 * Copy the scalar value of channel @idx of @src into @dst.  @idx may be an
 * immediate (handled with a simple strided MOV) or a register, in which
 * case Align1 mode uses register-indirect addressing through a0 and
 * Align16 (SIMD4x2) mode uses a flag-predicated SEL.
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address. The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV. Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3447
/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
 * one u32. So we use the same untyped atomic write message as the pixel
 * shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   /* Single-channel (exec_size 1) untyped atomic ADD with no write-back. */
   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
                          brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
                                                     false)));

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_binding_table_index(devinfo, send, surf_index);

   brw_pop_insn_state(p);
}
3496
3497
3498 /**
3499 * Emit the SEND message for a barrier
3500 */
3501 void
3502 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3503 {
3504 const struct gen_device_info *devinfo = p->devinfo;
3505 struct brw_inst *inst;
3506
3507 assert(devinfo->gen >= 7);
3508
3509 brw_push_insn_state(p);
3510 brw_set_default_access_mode(p, BRW_ALIGN_1);
3511 inst = next_insn(p, BRW_OPCODE_SEND);
3512 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3513 brw_set_src0(p, inst, src);
3514 brw_set_src1(p, inst, brw_null_reg());
3515 brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
3516
3517 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
3518 brw_inst_set_gateway_notify(devinfo, inst, 1);
3519 brw_inst_set_gateway_subfuncid(devinfo, inst,
3520 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3521
3522 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3523 brw_pop_insn_state(p);
3524 }
3525
3526
3527 /**
3528 * Emit the wait instruction for a barrier
3529 */
3530 void
3531 brw_WAIT(struct brw_codegen *p)
3532 {
3533 const struct gen_device_info *devinfo = p->devinfo;
3534 struct brw_inst *insn;
3535
3536 struct brw_reg src = brw_notification_reg();
3537
3538 insn = next_insn(p, BRW_OPCODE_WAIT);
3539 brw_set_dest(p, insn, src);
3540 brw_set_src0(p, insn, src);
3541 brw_set_src1(p, insn, brw_null_reg());
3542
3543 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3544 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3545 }
3546
3547 void
3548 brw_float_controls_mode(struct brw_codegen *p,
3549 unsigned mode, unsigned mask)
3550 {
3551 brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3552 brw_imm_ud(~mask));
3553 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3554
3555 /* From the Skylake PRM, Volume 7, page 760:
3556 * "Implementation Restriction on Register Access: When the control
3557 * register is used as an explicit source and/or destination, hardware
3558 * does not ensure execution pipeline coherency. Software must set the
3559 * thread control field to ‘switch’ for an instruction that uses
3560 * control register as an explicit operand."
3561 */
3562 if (p->devinfo->gen < 12)
3563 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3564
3565 if (mode) {
3566 brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3567 brw_imm_ud(mode));
3568 brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
3569 if (p->devinfo->gen < 12)
3570 brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
3571 }
3572 }