intel/compiler: Add Immediate support for 3 source instruction
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 struct brw_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct gen_device_info *devinfo = p->devinfo;
51 if (devinfo->gen < 6)
52 return;
53
54 if (src->file == BRW_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58 assert(devinfo->gen < 12);
59 brw_push_insn_state(p);
60 brw_set_default_exec_size(p, BRW_EXECUTE_8);
61 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64 retype(*src, BRW_REGISTER_TYPE_UD));
65 brw_pop_insn_state(p);
66 }
67 *src = brw_message_reg(msg_reg_nr);
68 }
69
70 static void
71 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72 {
73 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74 * "The send with EOT should use register space R112-R127 for <src>. This is
75 * to enable loading of a new thread into the same slot while the message
76 * with EOT for current thread is pending dispatch."
77 *
78 * Since we're pretending to have 16 MRFs anyway, we may as well use the
79 * registers required for messages with EOT.
80 */
81 const struct gen_device_info *devinfo = p->devinfo;
82 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83 reg->file = BRW_GENERAL_REGISTER_FILE;
84 reg->nr += GEN7_MRF_HACK_START;
85 }
86 }
87
/**
 * Encode the destination operand \p dest into \p inst: register file and
 * number, subregister, horizontal stride, and (in align16 mode) the
 * writemask.  May also shrink the instruction's execution size to match a
 * small destination when p->automatic_exec_sizes is enabled.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the register file's limit. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where if the destination is Byte,
    * the instruction needs to have a stride of 2 (except for packed byte
    * MOV). This seems to be required even if the destination is the NULL
    * register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   gen7_convert_mrf_to_grf(p, &dest);

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gen12+ SEND(C): only the register file and number are encoded;
       * there are no type/stride/subregister destination fields, so the
       * operand must already be in the canonical form the asserts check.
       */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gen12 split-send: the destination subregister is encoded in
       * 16-byte units, hence the alignment requirement and the /16 below.
       */
      assert(devinfo->gen < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      /* Ordinary (non-split-send) destination encoding. */
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            /* The destination field has no encoding for stride 0; promote
             * it to the equivalent stride 1.
             */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             * Although Dst.HorzStride is a don't care for Align16, HW needs
             * this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         /* Register-indirect addressing. */
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct. However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
206
/**
 * Encode the first source operand \p reg into \p inst: register file, type,
 * modifiers (abs/negate), addressing mode, region (vstride/width/hstride or
 * align16 swizzle) — or, for an immediate, the immediate payload itself.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the register file's limit. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gen12+ SEND(C): only the register file and number are encoded. */
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gen12 split-send: subregister is encoded in 16-byte units. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             reg.vstride == reg.width + 1);
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* Pick the immediate field width based on the register type (DIM
          * carries a DF immediate regardless of the declared type).
          */
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         /* Mirror src0's hw type into the src1 slot for sub-64-bit
          * immediates on pre-Gen12 — presumably a hardware encoding
          * requirement; NOTE(review): confirm against the PRM.
          */
         if (devinfo->gen < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* Collapse the region to a scalar when both the register width
             * and the instruction's execution size are 1.
             */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            /* Align16: the region is expressed as a per-channel swizzle. */
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
340
341
/**
 * Encode the second source operand \p reg into \p inst.  Mirrors
 * brw_set_src0() but for the src1 fields; unlike src0, src1 may be a
 * 32-bit immediate and must never be an accumulator or MRF.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->gen >= 12 &&
        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
      /* Split-send src1 (or Gen12+ SEND(C)): only the register file and
       * number are encoded; the operand must be in canonical form.
       */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *    operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gen7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* Collapse the region to a scalar when both the register width
             * and the instruction's execution size are 1.
             */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            /* Align16: the region is expressed as a per-channel swizzle. */
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
447
448 /**
449 * Specify the descriptor and extended descriptor immediate for a SEND(C)
450 * message instruction.
451 */
452 void
453 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
454 unsigned desc, unsigned ex_desc)
455 {
456 const struct gen_device_info *devinfo = p->devinfo;
457 assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
458 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
459 if (devinfo->gen < 12)
460 brw_inst_set_src1_file_type(devinfo, inst,
461 BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
462 brw_inst_set_send_desc(devinfo, inst, desc);
463 if (devinfo->gen >= 9)
464 brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
465 }
466
/**
 * Fill in the message descriptor fields of \p inst for a message to the
 * extended-math shared function.  Message and response lengths are derived
 * from \p function; the instruction's saturate bit is moved into the
 * message descriptor and cleared on the instruction itself.
 */
static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      /* Two-operand functions need a second payload register. */
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      /* These produce two result values, hence two response registers. */
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, msg_length, response_length, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   /* Saturation is performed by the math unit, so transfer the
    * instruction's saturate bit into the message and clear it.
    */
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}
513
514
/**
 * Fill in the message descriptor fields of \p insn for an FF_SYNC URB
 * message (single payload register, header present).
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
536
/**
 * Fill in the message descriptor fields of \p insn for a URB write
 * message.  \p flags selects OWORD vs HWORD writes, EOT, and the various
 * generation-specific control bits; some flags are only valid on a subset
 * of generations, which the leading asserts enforce.
 */
static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Transpose swizzling and allocation went away on Gen7; per-slot
    * offsets only exist from Gen7 on.
    */
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   /* The "complete" bit exists only before Gen8. */
   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   /* allocate/used are pre-Gen7 fields; per-slot offset replaces them. */
   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
579
/**
 * Fill in the message descriptor fields of \p inst for a Gen7+ data-port
 * scratch-space block read/write.
 *
 * \param write                  true for a write, false for a read
 * \param dword                  true for DWord-scattered, false for OWord-block access
 * \param invalidate_after_read  invalidate the cache line after reading
 * \param num_regs               registers per block: 1, 2, 4, or (Gen8+) 8
 * \param addr_offset            scratch-space offset
 * \param mlen / rlen            message / response lengths in registers
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   /* Gen8+ encodes the block size as log2(num_regs); earlier generations
    * encode it as num_regs - 1.
    */
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
609
/**
 * Apply the default instruction state in \p state (exec size, masking,
 * predication, flag register, etc.) to a freshly-allocated \p insn.
 */
static void
brw_inst_set_state(const struct gen_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   /* Software scoreboard dependency info only exists on Gen12+. */
   if (devinfo->gen >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* 3-source align16 instructions encode the flag register in different
    * fields than everything else; flag_subreg packs reg and subreg as
    * (reg * 2 + subreg), hence the % 2 and / 2 below.
    */
   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
640
#define next_insn brw_next_insn
/**
 * Allocate the next instruction slot in \p p's store (growing the store by
 * doubling when full), zero it, set its opcode, and apply the current
 * default instruction state.  Returns a pointer into p->store, which is
 * only valid until the next allocation reallocates the store.
 */
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* Grow the instruction store geometrically when it fills up. */
   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   /* Advance by one full-size (16-byte) instruction slot. */
   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];

   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(devinfo, insn, opcode);

   /* Apply the default instruction state */
   brw_inst_set_state(devinfo, insn, p->current);

   return insn;
}
664
665 static brw_inst *
666 brw_alu1(struct brw_codegen *p, unsigned opcode,
667 struct brw_reg dest, struct brw_reg src)
668 {
669 brw_inst *insn = next_insn(p, opcode);
670 brw_set_dest(p, insn, dest);
671 brw_set_src0(p, insn, src);
672 return insn;
673 }
674
675 static brw_inst *
676 brw_alu2(struct brw_codegen *p, unsigned opcode,
677 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
678 {
679 /* 64-bit immediates are only supported on 1-src instructions */
680 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
681 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
682
683 brw_inst *insn = next_insn(p, opcode);
684 brw_set_dest(p, insn, dest);
685 brw_set_src0(p, insn, src0);
686 brw_set_src1(p, insn, src1);
687 return insn;
688 }
689
690 static int
691 get_3src_subreg_nr(struct brw_reg reg)
692 {
693 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
694 * use 32-bit units (components 0..7). Since they only support F/D/UD
695 * types, this doesn't lose any flexibility, but uses fewer bits.
696 */
697 return reg.subnr / 4;
698 }
699
700 static enum gen10_align1_3src_vertical_stride
701 to_3src_align1_vstride(const struct gen_device_info *devinfo,
702 enum brw_vertical_stride vstride)
703 {
704 switch (vstride) {
705 case BRW_VERTICAL_STRIDE_0:
706 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
707 case BRW_VERTICAL_STRIDE_1:
708 assert(devinfo->gen >= 12);
709 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
710 case BRW_VERTICAL_STRIDE_2:
711 assert(devinfo->gen < 12);
712 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
713 case BRW_VERTICAL_STRIDE_4:
714 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
715 case BRW_VERTICAL_STRIDE_8:
716 case BRW_VERTICAL_STRIDE_16:
717 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
718 default:
719 unreachable("invalid vstride");
720 }
721 }
722
723
724 static enum gen10_align1_3src_src_horizontal_stride
725 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
726 {
727 switch (hstride) {
728 case BRW_HORIZONTAL_STRIDE_0:
729 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
730 case BRW_HORIZONTAL_STRIDE_1:
731 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
732 case BRW_HORIZONTAL_STRIDE_2:
733 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
734 case BRW_HORIZONTAL_STRIDE_4:
735 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
736 default:
737 unreachable("invalid hstride");
738 }
739 }
740
741 static brw_inst *
742 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
743 struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
744 {
745 const struct gen_device_info *devinfo = p->devinfo;
746 brw_inst *inst = next_insn(p, opcode);
747
748 gen7_convert_mrf_to_grf(p, &dest);
749
750 assert(dest.nr < 128);
751
752 if (devinfo->gen >= 10)
753 assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
754 src2.file == BRW_IMMEDIATE_VALUE));
755
756 assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
757 assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
758 assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
759 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
760 assert(src0.address_mode == BRW_ADDRESS_DIRECT);
761 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
762 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
763
764 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
765 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
766 dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
767
768 if (devinfo->gen >= 12) {
769 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
770 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
771 } else {
772 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
773 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
774 BRW_ALIGN1_3SRC_ACCUMULATOR);
775 brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
776 } else {
777 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
778 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
779 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
780 }
781 }
782 brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
783
784 brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
785
786 if (brw_reg_type_is_floating_point(dest.type)) {
787 brw_inst_set_3src_a1_exec_type(devinfo, inst,
788 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
789 } else {
790 brw_inst_set_3src_a1_exec_type(devinfo, inst,
791 BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
792 }
793
794 brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
795 brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
796 brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
797 brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
798
799 if (src0.file == BRW_IMMEDIATE_VALUE) {
800 brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
801 } else {
802 brw_inst_set_3src_a1_src0_vstride(
803 devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
804 brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
805 to_3src_align1_hstride(src0.hstride));
806 brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
807 if (src0.type == BRW_REGISTER_TYPE_NF) {
808 brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
809 } else {
810 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
811 }
812 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
813 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
814 }
815 brw_inst_set_3src_a1_src1_vstride(
816 devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
817 brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
818 to_3src_align1_hstride(src1.hstride));
819
820 brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
821 if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
822 brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
823 } else {
824 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
825 }
826 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
827 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
828
829 if (src2.file == BRW_IMMEDIATE_VALUE) {
830 brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
831 } else {
832 brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
833 to_3src_align1_hstride(src2.hstride));
834 /* no vstride on src2 */
835 brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
836 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
837 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
838 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
839 }
840
841 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
842 src0.file == BRW_IMMEDIATE_VALUE ||
843 (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
844 src0.type == BRW_REGISTER_TYPE_NF));
845 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
846 src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
847 assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
848 src2.file == BRW_IMMEDIATE_VALUE);
849
850 if (devinfo->gen >= 12) {
851 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
852 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);
853 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
854 } else {
855 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
856 src0.file == BRW_GENERAL_REGISTER_FILE ?
857 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
858 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
859 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
860 src1.file == BRW_GENERAL_REGISTER_FILE ?
861 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
862 BRW_ALIGN1_3SRC_ACCUMULATOR);
863 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
864 src2.file == BRW_GENERAL_REGISTER_FILE ?
865 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
866 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
867 }
868
869 } else {
870 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
871 dest.file == BRW_MESSAGE_REGISTER_FILE);
872 assert(dest.type == BRW_REGISTER_TYPE_F ||
873 dest.type == BRW_REGISTER_TYPE_DF ||
874 dest.type == BRW_REGISTER_TYPE_D ||
875 dest.type == BRW_REGISTER_TYPE_UD ||
876 (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
877 if (devinfo->gen == 6) {
878 brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
879 dest.file == BRW_MESSAGE_REGISTER_FILE);
880 }
881 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
882 brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
883 brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
884
885 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
886 brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
887 brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
888 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
889 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
890 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
891 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
892 src0.vstride == BRW_VERTICAL_STRIDE_0);
893
894 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
895 brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
896 brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
897 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
898 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
899 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
900 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
901 src1.vstride == BRW_VERTICAL_STRIDE_0);
902
903 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
904 brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
905 brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
906 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
907 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
908 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
909 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
910 src2.vstride == BRW_VERTICAL_STRIDE_0);
911
912 if (devinfo->gen >= 7) {
913 /* Set both the source and destination types based on dest.type,
914 * ignoring the source register types. The MAD and LRP emitters ensure
915 * that all four types are float. The BFE and BFI2 emitters, however,
916 * may send us mixed D and UD types and want us to ignore that and use
917 * the destination type.
918 */
919 brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
920 brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
921
922 /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
923 *
924 * "Three source instructions can use operands with mixed-mode
925 * precision. When SrcType field is set to :f or :hf it defines
926 * precision for source 0 only, and fields Src1Type and Src2Type
927 * define precision for other source operands:
928 *
929 * 0b = :f. Single precision Float (32-bit).
930 * 1b = :hf. Half precision Float (16-bit)."
931 */
932 if (src1.type == BRW_REGISTER_TYPE_HF)
933 brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
934
935 if (src2.type == BRW_REGISTER_TYPE_HF)
936 brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
937 }
938 }
939
940 return inst;
941 }
942
943
944 /***********************************************************************
945 * Convenience routines.
946 */
/* Define a convenience emitter brw_<OP> for a one-source ALU instruction;
 * it simply forwards to brw_alu1() with the matching opcode.
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
              struct brw_reg dest,                          \
              struct brw_reg src0)                          \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}
954
/* Define a convenience emitter brw_<OP> for a two-source ALU instruction;
 * it simply forwards to brw_alu2() with the matching opcode.
 */
#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
              struct brw_reg dest,                          \
              struct brw_reg src0,                          \
              struct brw_reg src1)                          \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}
963
/* Define a convenience emitter brw_<OP> for a three-source ALU instruction.
 * In Align16 mode a source with vstride 0 is a scalar, so replicate it
 * across all channels with a .xxxx swizzle before emitting.
 */
#define ALU3(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
              struct brw_reg dest,                          \
              struct brw_reg src0,                          \
              struct brw_reg src1,                          \
              struct brw_reg src2)                          \
{                                                           \
   if (p->current->access_mode == BRW_ALIGN_16) {           \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)            \
         src0.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)            \
         src1.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)            \
         src2.swizzle = BRW_SWIZZLE_XXXX;                   \
   }                                                        \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
981
/* Like ALU3, but additionally asserts that the destination and all three
 * sources are uniformly float (F) or uniformly double (DF).
 */
#define ALU3F(OP)                                           \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
              struct brw_reg dest,                          \
              struct brw_reg src0,                          \
              struct brw_reg src1,                          \
              struct brw_reg src2)                          \
{                                                           \
   assert(dest.type == BRW_REGISTER_TYPE_F ||               \
          dest.type == BRW_REGISTER_TYPE_DF);               \
   if (dest.type == BRW_REGISTER_TYPE_F) {                  \
      assert(src0.type == BRW_REGISTER_TYPE_F);             \
      assert(src1.type == BRW_REGISTER_TYPE_F);             \
      assert(src2.type == BRW_REGISTER_TYPE_F);             \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {          \
      assert(src0.type == BRW_REGISTER_TYPE_DF);            \
      assert(src1.type == BRW_REGISTER_TYPE_DF);            \
      assert(src2.type == BRW_REGISTER_TYPE_DF);            \
   }                                                        \
                                                            \
   if (p->current->access_mode == BRW_ALIGN_16) {           \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)            \
         src0.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)            \
         src1.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)            \
         src2.swizzle = BRW_SWIZZLE_XXXX;                   \
   }                                                        \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
1011
1012 /* Rounding operations (other than RNDD) require two instructions - the first
1013 * stores a rounded value (possibly the wrong way) in the dest register, but
1014 * also sets a per-channel "increment bit" in the flag register. A predicated
1015 * add of 1.0 fixes dest to contain the desired result.
1016 *
1017 * Sandybridge and later appear to round correctly without an ADD.
1018 */
/* Define brw_<OP> for a rounding instruction.  Pre-Gen6 it emits the
 * two-instruction sequence described in the comment above: the round
 * itself (with the round-increment conditional modifier) followed by a
 * predicated ADD of 1.0 to fix up channels that rounded the wrong way.
 */
#define ROUND(OP)                                                       \
void brw_##OP(struct brw_codegen *p,                                    \
              struct brw_reg dest,                                      \
              struct brw_reg src)                                       \
{                                                                       \
   const struct gen_device_info *devinfo = p->devinfo;                  \
   brw_inst *rnd, *add;                                                 \
   rnd = next_insn(p, BRW_OPCODE_##OP);                                 \
   brw_set_dest(p, rnd, dest);                                          \
   brw_set_src0(p, rnd, src);                                           \
                                                                        \
   if (devinfo->gen < 6) {                                              \
      /* turn on round-increments */                                    \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                    \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);    \
   }                                                                    \
}
1037
1038
/* Instantiate the convenience emitters defined by the macros above. */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

/* Two-instruction rounding sequences (see the ROUND comment above). */
ROUND(RNDZ)
ROUND(RNDE)
1074
1075 brw_inst *
1076 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1077 {
1078 const struct gen_device_info *devinfo = p->devinfo;
1079
1080 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1081 * To avoid the problems that causes, we use an <X,2,0> source region to
1082 * read each element twice.
1083 */
1084 if (devinfo->gen == 7 && !devinfo->is_haswell &&
1085 brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1086 dest.type == BRW_REGISTER_TYPE_DF &&
1087 (src0.type == BRW_REGISTER_TYPE_F ||
1088 src0.type == BRW_REGISTER_TYPE_D ||
1089 src0.type == BRW_REGISTER_TYPE_UD) &&
1090 !has_scalar_region(src0)) {
1091 assert(src0.vstride == src0.width + src0.hstride);
1092 src0.vstride = src0.hstride;
1093 src0.width = BRW_WIDTH_2;
1094 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1095 }
1096
1097 return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1098 }
1099
1100 brw_inst *
1101 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1102 struct brw_reg src0, struct brw_reg src1)
1103 {
1104 /* 6.2.2: add */
1105 if (src0.type == BRW_REGISTER_TYPE_F ||
1106 (src0.file == BRW_IMMEDIATE_VALUE &&
1107 src0.type == BRW_REGISTER_TYPE_VF)) {
1108 assert(src1.type != BRW_REGISTER_TYPE_UD);
1109 assert(src1.type != BRW_REGISTER_TYPE_D);
1110 }
1111
1112 if (src1.type == BRW_REGISTER_TYPE_F ||
1113 (src1.file == BRW_IMMEDIATE_VALUE &&
1114 src1.type == BRW_REGISTER_TYPE_VF)) {
1115 assert(src0.type != BRW_REGISTER_TYPE_UD);
1116 assert(src0.type != BRW_REGISTER_TYPE_D);
1117 }
1118
1119 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1120 }
1121
1122 brw_inst *
1123 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1124 struct brw_reg src0, struct brw_reg src1)
1125 {
1126 assert(dest.type == src0.type);
1127 assert(src0.type == src1.type);
1128 switch (src0.type) {
1129 case BRW_REGISTER_TYPE_B:
1130 case BRW_REGISTER_TYPE_UB:
1131 case BRW_REGISTER_TYPE_W:
1132 case BRW_REGISTER_TYPE_UW:
1133 case BRW_REGISTER_TYPE_D:
1134 case BRW_REGISTER_TYPE_UD:
1135 break;
1136 default:
1137 unreachable("Bad type for brw_AVG");
1138 }
1139
1140 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1141 }
1142
1143 brw_inst *
1144 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1145 struct brw_reg src0, struct brw_reg src1)
1146 {
1147 /* 6.32.38: mul */
1148 if (src0.type == BRW_REGISTER_TYPE_D ||
1149 src0.type == BRW_REGISTER_TYPE_UD ||
1150 src1.type == BRW_REGISTER_TYPE_D ||
1151 src1.type == BRW_REGISTER_TYPE_UD) {
1152 assert(dest.type != BRW_REGISTER_TYPE_F);
1153 }
1154
1155 if (src0.type == BRW_REGISTER_TYPE_F ||
1156 (src0.file == BRW_IMMEDIATE_VALUE &&
1157 src0.type == BRW_REGISTER_TYPE_VF)) {
1158 assert(src1.type != BRW_REGISTER_TYPE_UD);
1159 assert(src1.type != BRW_REGISTER_TYPE_D);
1160 }
1161
1162 if (src1.type == BRW_REGISTER_TYPE_F ||
1163 (src1.file == BRW_IMMEDIATE_VALUE &&
1164 src1.type == BRW_REGISTER_TYPE_VF)) {
1165 assert(src0.type != BRW_REGISTER_TYPE_UD);
1166 assert(src0.type != BRW_REGISTER_TYPE_D);
1167 }
1168
1169 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1170 src0.nr != BRW_ARF_ACCUMULATOR);
1171 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1172 src1.nr != BRW_ARF_ACCUMULATOR);
1173
1174 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1175 }
1176
1177 brw_inst *
1178 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1179 struct brw_reg src0, struct brw_reg src1)
1180 {
1181 src0.vstride = BRW_VERTICAL_STRIDE_0;
1182 src0.width = BRW_WIDTH_1;
1183 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1184 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1185 }
1186
1187 brw_inst *
1188 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1189 struct brw_reg src0, struct brw_reg src1)
1190 {
1191 src0.vstride = BRW_VERTICAL_STRIDE_0;
1192 src0.width = BRW_WIDTH_1;
1193 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1194 src1.vstride = BRW_VERTICAL_STRIDE_8;
1195 src1.width = BRW_WIDTH_8;
1196 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1197 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1198 }
1199
/* Emit a float-to-half conversion into dst.  On Gen8+ this is a converting
 * MOV to an HF-typed view of dst; on Gen7 it is the dedicated F32TO16
 * instruction.  When dst is a UD register whose high 16 bits would
 * otherwise be left stale, a second MOV zero-fills them.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* View dst as W elements with a stride of 2 so the conversion only
       * writes the low 16 bits of each 32-bit channel.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* Zero the high halves with a second MOV.  Pre-Gen12, the
       * no_dd_clear/no_dd_check pair is set on the two instructions;
       * Gen12+ uses SWSB (set to null here) instead.
       */
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      brw_set_default_swsb(p, tgl_swsb_null());
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1249
1250 brw_inst *
1251 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1252 {
1253 const struct gen_device_info *devinfo = p->devinfo;
1254 bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1255
1256 if (align16) {
1257 assert(src.type == BRW_REGISTER_TYPE_UD);
1258 } else {
1259 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1260 *
1261 * Because this instruction does not have a 16-bit floating-point
1262 * type, the source data type must be Word (W). The destination type
1263 * must be F (Float).
1264 */
1265 if (src.type == BRW_REGISTER_TYPE_UD)
1266 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1267
1268 assert(src.type == BRW_REGISTER_TYPE_W ||
1269 src.type == BRW_REGISTER_TYPE_UW ||
1270 src.type == BRW_REGISTER_TYPE_HF);
1271 }
1272
1273 if (devinfo->gen >= 8) {
1274 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1275 } else {
1276 assert(devinfo->gen == 7);
1277 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1278 }
1279 }
1280
1281
/* Emit a NOP.  The instruction is zeroed in full first so no stale bits
 * from the instruction store survive, then the opcode is written back.
 */
void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}
1288
/* Emit a SYNC instruction (Gen12+, judging by the tgl_ prefix — confirm);
 * the requested sync function is encoded in the condition-modifier field.
 */
void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
   brw_inst_set_cond_modifier(p->devinfo, insn, func);
}
1294
1295 /***********************************************************************
1296 * Comparisons, if/else/endif
1297 */
1298
1299 brw_inst *
1300 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1301 unsigned predicate_control)
1302 {
1303 const struct gen_device_info *devinfo = p->devinfo;
1304 struct brw_reg ip = brw_ip_reg();
1305 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1306
1307 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1308 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1309 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1310 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1311
1312 return inst;
1313 }
1314
1315 static void
1316 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1317 {
1318 p->if_stack[p->if_stack_depth] = inst - p->store;
1319
1320 p->if_stack_depth++;
1321 if (p->if_stack_array_size <= p->if_stack_depth) {
1322 p->if_stack_array_size *= 2;
1323 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1324 p->if_stack_array_size);
1325 }
1326 }
1327
1328 static brw_inst *
1329 pop_if_stack(struct brw_codegen *p)
1330 {
1331 p->if_stack_depth--;
1332 return &p->store[p->if_stack[p->if_stack_depth]];
1333 }
1334
1335 static void
1336 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1337 {
1338 if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1339 p->loop_stack_array_size *= 2;
1340 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1341 p->loop_stack_array_size);
1342 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1343 p->loop_stack_array_size);
1344 }
1345
1346 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1347 p->loop_stack_depth++;
1348 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1349 }
1350
1351 static brw_inst *
1352 get_inner_do_insn(struct brw_codegen *p)
1353 {
1354 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1355 }
1356
1357 /* EU takes the value from the flag register and pushes it onto some
1358 * sort of a stack (presumably merging with any flag value already on
1359 * the stack). Within an if block, the flags at the top of the stack
1360 * control execution on each channel of the unit, eg. on each of the
1361 * 16 pixel values in our wm programs.
1362 *
1363 * When the matching 'else' instruction is reached (presumably by
1364 * countdown of the instruction count patched in by our ELSE/ENDIF
1365 * functions), the relevant flags are inverted.
1366 *
1367 * When the matching 'endif' instruction is reached, the flags are
1368 * popped off. If the stack is now empty, normal execution resumes.
1369 */
/* Emit an IF instruction with zeroed jump targets; the matching brw_ENDIF
 * patches them later via patch_IF_ELSE().  The operand encoding differs
 * per generation, hence the branching below.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-Gen6 flow control needs a thread switch unless we're in SPF mode. */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Remember this IF so the matching ELSE/ENDIF can find and patch it. */
   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1414
1415 /* This function is only used for gen6-style IF instructions with an
1416 * embedded comparison (conditional modifier). It is not used on gen7.
1417 */
/* Emit a Gen6-style IF with an embedded comparison: src0 and src1 are
 * compared using the given conditional modifier instead of relying on a
 * previously-set flag register.  The jump count starts at 0 and is patched
 * later by brw_ENDIF.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1440
/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 *
 * \param if_inst    the IF being converted (must be EXECUTE_1)
 * \param else_inst  the matching ELSE, or NULL if there is none
 *
 * The ADDs move the instruction pointer directly, so no mask-stack
 * operations (and hence no ENDIF) are needed.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ELSE);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1481
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * \param if_inst     the IF opening the block
 * \param else_inst   the matching ELSE, or NULL if there is none
 * \param endif_inst  the ENDIF closing the block (never NULL)
 *
 * Jump distances are expressed in units of brw_jump_scale() per
 * instruction; the per-generation encodings (gen4 jump/pop counts, gen6
 * jump count, gen7+ JIP/UIP) are handled separately below.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1572
/* Emit an ELSE instruction with zeroed jump fields (patched later by
 * brw_ENDIF via patch_IF_ELSE) and push it onto the if-stack so the
 * matching ENDIF can find it.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* Operand encoding differs per generation; jump targets start at 0. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-Gen6 flow control needs a thread switch unless we're in SPF mode. */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1611
/* Close the innermost IF/ELSE block: pop the IF (and optional ELSE) from
 * the if-stack, emit an ENDIF (except in pre-Gen6 SPF mode, where the
 * IF/ELSE are rewritten as conditional ADDs instead), and patch all jump
 * targets via patch_IF_ELSE().
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Operand encoding of ENDIF differs per generation. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-Gen6 flow control needs a thread switch. */
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1692
/* Emit a BREAK out of the innermost loop.  On pre-Gen6 the pop count
 * records how many nested IF levels inside the current loop the break
 * must pop off the mask stack.
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
1719
/* Emit a CONTINUE to the top of the innermost loop.  On pre-Gen6 the pop
 * count records how many nested IF levels inside the current loop must be
 * popped off the mask stack.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1743
/**
 * Emit a HALT instruction (Gen6+ encoding).
 *
 * The UIP/JIP jump targets are emitted as 0 and fixed up later.  Note the
 * gen-dependent operand forms: Gen6-7 use null src0 plus an immediate src1,
 * Gen8-11 a single immediate src0, and Gen12+ take no explicit sources here.
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen < 8) {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   } else if (devinfo->gen < 12) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1763
1764 /* DO/WHILE loop:
1765 *
1766 * The DO/WHILE is just an unterminated loop -- break or continue are
1767 * used for control within the loop. We have a few ways they can be
1768 * done.
1769 *
1770 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1771 * jip and no DO instruction.
1772 *
1773 * For non-uniform control flow pre-gen6, there's a DO instruction to
1774 * push the mask, and a WHILE to jump back, and BREAK to get out and
1775 * pop the mask.
1776 *
1777 * For gen6, there's no more mask stack, so no need for DO. WHILE
1778 * just points back to the first instruction of the loop.
1779 */
/**
 * Open a DO/WHILE loop.
 *
 * On Gen6+ (and for single-program-flow code) no DO instruction is emitted;
 * we just record the address of the loop's first instruction on the loop
 * stack so brw_WHILE() can jump back to it.  On Gen4-5 an actual DO
 * instruction is emitted to push the execution mask.
 *
 * Returns a pointer to the loop-top instruction slot (which on Gen6+ is the
 * not-yet-emitted first instruction of the loop body).
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1806
1807 /**
1808 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1809 * instruction here.
1810 *
1811 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1812 * nesting, since it can always just point to the end of the block/current loop.
1813 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   /* Walk backwards from the WHILE to the matching DO, fixing up any
    * BREAK/CONT emitted inside this loop body.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps to the instruction after the WHILE, hence the +1. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONTINUE jumps to the WHILE itself. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
1838
/**
 * Close the innermost DO/WHILE loop with a WHILE instruction that jumps
 * back to the loop top recorded by brw_DO().
 *
 * The operand encoding varies significantly by generation (see the
 * gen-specific branches below); distances are scaled by brw_jump_scale().
 * Pops the loop stack before returning.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         /* Gen12+ WHILE takes no immediate src0. */
         if (devinfo->gen < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         /* Backward jump: do_insn precedes insn, so JIP is negative. */
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* Uniform control flow: the "WHILE" is just an IP-relative ADD. */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Resolve BREAK/CONT jump counts inside this loop body. */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1901
1902 /* FORWARD JUMPS:
1903 */
1904 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1905 {
1906 const struct gen_device_info *devinfo = p->devinfo;
1907 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1908 unsigned jmpi = 1;
1909
1910 if (devinfo->gen >= 5)
1911 jmpi = 2;
1912
1913 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1914 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1915
1916 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1917 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1918 }
1919
1920 /* To integrate with the above, it makes sense that the comparison
1921 * instruction should populate the flag register. It might be simpler
1922 * just to use the flag reg for most WM tasks?
1923 */
1924 void brw_CMP(struct brw_codegen *p,
1925 struct brw_reg dest,
1926 unsigned conditional,
1927 struct brw_reg src0,
1928 struct brw_reg src1)
1929 {
1930 const struct gen_device_info *devinfo = p->devinfo;
1931 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1932
1933 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1934 brw_set_dest(p, insn, dest);
1935 brw_set_src0(p, insn, src0);
1936 brw_set_src1(p, insn, src1);
1937
1938 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1939 * page says:
1940 * "Any CMP instruction with a null destination must use a {switch}."
1941 *
1942 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1943 * mentioned on their work-arounds pages.
1944 */
1945 if (devinfo->gen == 7) {
1946 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1947 dest.nr == BRW_ARF_NULL) {
1948 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1949 }
1950 }
1951 }
1952
1953 /***********************************************************************
1954 * Helpers for the various SEND message types:
1955 */
1956
1957 /** Extended math function, float[8].
1958 */
1959 void gen4_math(struct brw_codegen *p,
1960 struct brw_reg dest,
1961 unsigned function,
1962 unsigned msg_reg_nr,
1963 struct brw_reg src,
1964 unsigned precision )
1965 {
1966 const struct gen_device_info *devinfo = p->devinfo;
1967 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1968 unsigned data_type;
1969 if (has_scalar_region(src)) {
1970 data_type = BRW_MATH_DATA_SCALAR;
1971 } else {
1972 data_type = BRW_MATH_DATA_VECTOR;
1973 }
1974
1975 assert(devinfo->gen < 6);
1976
1977 /* Example code doesn't set predicate_control for send
1978 * instructions.
1979 */
1980 brw_inst_set_pred_control(devinfo, insn, 0);
1981 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1982
1983 brw_set_dest(p, insn, dest);
1984 brw_set_src0(p, insn, src);
1985 brw_set_math_message(p,
1986 insn,
1987 function,
1988 src.type == BRW_REGISTER_TYPE_D,
1989 precision,
1990 data_type);
1991 }
1992
/**
 * Extended math on Gen6+, emitted as a native MATH instruction rather than
 * a SEND.  The assertions below encode the hardware restrictions: packed
 * destinations, packed sources on Gen6, integer types for INT_DIV variants
 * (with an immediate src1 only allowed on Gen8+), float/half-float types
 * for all other functions, and no source modifiers on Gen6.
 */
void gen6_math(struct brw_codegen *p,
	       struct brw_reg dest,
	       unsigned function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F ||
             (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
      assert(src1.type == BRW_REGISTER_TYPE_F ||
             (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
2041
2042 /**
2043 * Return the right surface index to access the thread scratch space using
2044 * stateless dataport messages.
2045 */
2046 unsigned
2047 brw_scratch_surface_idx(const struct brw_codegen *p)
2048 {
2049 /* The scratch space is thread-local so IA coherency is unnecessary. */
2050 if (p->devinfo->gen >= 8)
2051 return GEN8_BTI_STATELESS_NON_COHERENT;
2052 else
2053 return BRW_BTI_STATELESS;
2054 }
2055
2056 /**
2057 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2058 * using a constant offset per channel.
2059 *
2060 * The offset must be aligned to oword size (16 bytes). Used for
2061 * register spilling.
2062 */
/**
 * Write @num_regs GRFs from @mrf+1 onward to the scratch buffer at
 * @offset (bytes, oword-aligned).  Emits a header-setup MOV pair followed
 * by the dataport SEND.  The SWSB save/restore around the header MOVs
 * keeps Gen12+ software scoreboard dependencies correct.
 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
				   struct brw_reg mrf,
				   int num_regs,
				   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);
   uint32_t msg_type;

   /* Gen6+ expresses the offset in owords rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One header register plus the data payload. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
      /* The SEND below must wait on the header writes. */
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
	 src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
	 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                     msg_type, 0, /* not a render target */
                                     send_commit_msg));
   }
}
2165
2166
2167 /**
2168 * Read a block of owords (half a GRF each) from the scratch buffer
2169 * using a constant index per channel.
2170 *
2171 * Offset must be aligned to oword size (16 bytes). Used for register
2172 * spilling.
2173 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   /* Gen6+ expresses the offset in owords rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_READ);

   /* Build the message header: a copy of g0 with g0.2 replaced by the
    * scratch offset (see brw_oword_block_write_scratch for rationale).
    */
   {
      brw_push_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
      /* The SEND below must wait on the header writes. */
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest);	/* UW? */
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
	 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, 1, rlen, true) |
                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   }
}
2248
/**
 * Read @num_regs registers from the scratch buffer at @offset (bytes,
 * register-aligned) into @dest, using the Gen7+ scratch block-read message.
 * Unlike the oword path this needs no explicit offset setup: the message
 * header is just g0 (carrying the per-thread scratch pointer in g0.5) and
 * the HWORD offset is encoded directly in the message descriptor.
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
2282
2283 /**
2284 * Read float[4] vectors from the data port constant cache.
2285 * Location (in buffer) should be a multiple of 16.
2286 * Used for fetching shader constants.
2287 */
/**
 * Read float[4] vectors from the data port constant cache.
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 *
 * @param dest             destination GRF for the returned data
 * @param mrf              message register for the header (pre-Gen7 style)
 * @param offset           byte offset into the buffer (oword-aligned)
 * @param bind_table_index binding table entry describing the buffer
 */
void brw_oword_block_read(struct brw_codegen *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_SFID_DATAPORT_READ);
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Build the message header: g0 with the global offset in element 2. */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_swsb(p, tgl_swsb_null());
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));
   brw_pop_insn_state(p);

   /* The SEND below must wait on the header writes. */
   brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_inst_set_sfid(devinfo, insn, target_cache);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
                brw_dp_read_desc(devinfo, bind_table_index,
                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));

   brw_pop_insn_state(p);
}
2353
/**
 * Emit a render-target write message to the render cache dataport.
 *
 * On Gen6+ this is a SENDC (to serialize against prior color writes) with
 * a headerless color payload; pre-Gen6 it is a SEND from MRFs with an
 * implied header.  @eot marks the final message of the thread.
 *
 * Returns the emitted instruction so the caller can tweak it further.
 */
brw_inst *
brw_fb_WRITE(struct brw_codegen *p,
             struct brw_reg payload,
             struct brw_reg implied_header,
             unsigned msg_control,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool eot,
             bool last_render_target,
             bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   /* Null destination sized to match the execution width. */
   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_dp_write_desc(devinfo, binding_table_index, msg_control,
                                  msg_type, last_render_target,
                                  0 /* send_commit_msg */));
   brw_inst_set_eot(devinfo, insn, eot);

   return insn;
}
2412
/**
 * Emit a Gen9+ render-target read message (SENDC to the render cache),
 * used to fetch the current framebuffer color for coherent blending.
 *
 * The message subtype encodes SIMD16 (0) vs. SIMD8 (1) based on the
 * current default execution size, and @per_sample selects per-sample
 * rather than per-pixel reads.
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   const unsigned msg_subtype =
      brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_desc(
      p, insn,
      brw_message_desc(devinfo, msg_length, response_length, true) |
      brw_dp_read_desc(devinfo, binding_table_index,
                       per_sample << 5 | msg_subtype,
                       GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                       BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   /* Select which half of a SIMD16 slot group this message covers. */
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}
2442
2443 /**
2444 * Texture sample instruction.
2445 * Note: the msg_type plus msg_length values determine exactly what kind
2446 * of sampling operation is performed. See volume 4, page 161 of docs.
2447 */
/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 *
 * Pass msg_reg_nr == -1 to skip the pre-Gen6 implied move of @src0 into
 * the message registers.
 */
void brw_SAMPLE(struct brw_codegen *p,
		struct brw_reg dest,
		unsigned msg_reg_nr,
		struct brw_reg src0,
		unsigned binding_table_index,
		unsigned sampler,
		unsigned msg_type,
		unsigned response_length,
		unsigned msg_length,
		unsigned header_present,
		unsigned simd_mode,
		unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_sampler_desc(devinfo, binding_table_index, sampler,
                                 msg_type, simd_mode, return_format));
}
2496
2497 /* Adjust the message header's sampler state pointer to
2498 * select the correct group of 16 samplers.
2499 */
/* Adjust the message header's sampler state pointer to
 * select the correct group of 16 samplers.
 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         /* Samplers >= 16 are only supported on Haswell and Gen8+. */
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      brw_push_insn_state(p);
      /* temp = (sampler & 0xf0) << 4, i.e. the byte offset of the
       * 16-sampler group, computed at runtime from the dynamic index.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      /* RegDist 1: the SHL consumes the AND result just written. */
      brw_set_default_swsb(p, tgl_swsb_regdist(1));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
      brw_pop_insn_state(p);
   }
}
2545
2546 /* All these variables are pretty confusing - we might be better off
2547 * using bitmasks and macros for this, in the old style. Or perhaps
2548 * just having the caller instantiate the fields in dword3 itself.
2549 */
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      /* OR 0xff00 into dword 5 of the header to enable all channels. */
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   /* Fill in the URB message descriptor from the flag/length arguments. */
   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2597
/**
 * Emit a SEND whose message descriptor is either an immediate (when
 * @desc is an immediate register) or computed at runtime into a0.0
 * (OR of @desc with the constant @desc_imm bits).  @eot marks the
 * final message of the thread.
 */
void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm,
                          bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      /* Descriptor fully known at compile time: encode it directly. */
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send, desc.ud | desc_imm);
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      /* The SEND must wait on the descriptor write to a0.0. */
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));

      if (devinfo->gen >= 12)
         brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
      else
         brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2651
/**
 * Emit a split SEND message (SENDS pre-Gen12, SEND on Gen12+) with two
 * payload sources.  The descriptor and extended descriptor are encoded as
 * immediates when possible; otherwise they are built into address
 * sub-registers (a0.0 for desc, a0.2 for ex_desc) and the instruction uses
 * register-indirect descriptor selection.
 *
 * \param sfid         Shared function (message target) ID.
 * \param payload0     First message payload source.
 * \param payload1     Second message payload source.
 * \param desc         Message descriptor, immediate or UD register.
 * \param desc_imm     Additional immediate bits ORed into the descriptor.
 * \param ex_desc      Extended message descriptor, immediate or register.
 * \param ex_desc_imm  Additional immediate bits ORed into ex_desc.
 * \param eot          Whether this message terminates the thread.
 */
void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   /* Fold desc_imm directly into an immediate descriptor; otherwise build
    * the combined descriptor in a0.0.
    */
   if (desc.file == BRW_IMMEDIATE_VALUE) {
      desc.ud |= desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   /* An immediate extended descriptor is only usable when bits 15:12 are
    * clear -- they have no room in the instruction encoding (see below) --
    * otherwise fall back to building it in a0.2.
    */
   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       (ex_desc.ud & INTEL_MASK(15, 12)) == 0) {
      ex_desc.ud |= ex_desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register.  If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_desc_imm | sfid | eot << 5;

      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding, so
          * we may have fallen back to an indirect extended descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   send = next_insn(p, devinfo->gen >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   /* Select immediate vs. register (a0.0) descriptor source. */
   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   /* Likewise for the extended descriptor; the address sub-register is
    * encoded in DWord units, hence the subnr >> 2.
    */
   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2766
/* Emit a SEND to the given surface dataport shared function.  If the surface
 * index is not an immediate it is masked to 8 bits and loaded into address
 * register a0.0, and the message is emitted with an indirect descriptor.
 */
static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      /* Preserve the caller's SWSB annotation across the descriptor setup
       * (Gen12+ software scoreboarding).
       */
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
}
2802
2803 static bool
2804 while_jumps_before_offset(const struct gen_device_info *devinfo,
2805 brw_inst *insn, int while_offset, int start_offset)
2806 {
2807 int scale = 16 / brw_jump_scale(devinfo);
2808 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2809 : brw_inst_jip(devinfo, insn);
2810 assert(jip < 0);
2811 return while_offset + jip * scale <= start_offset;
2812 }
2813
2814
/* Scan forward from just after start_offset for the instruction that ends
 * the current control-flow block: an ENDIF, ELSE or HALT at nesting depth
 * zero, or the WHILE of the enclosing loop.  Returns the offset of that
 * instruction, or 0 if none is found before the end of the program store.
 */
static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct gen_device_info *devinfo = p->devinfo;

   int depth = 0;   /* IF/ENDIF nesting relative to start_offset */

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_IF:
         depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case BRW_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop.  Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         /* fallthrough */
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_HALT:
         if (depth == 0)
            return offset;
      default:
         break;
      }
   }

   return 0;
}
2856
2857 /* There is no DO instruction on gen6, so to find the end of the loop
2858 * we have to see if the loop is jumping back before our start
2859 * instruction.
2860 */
2861 static int
2862 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2863 {
2864 const struct gen_device_info *devinfo = p->devinfo;
2865 int offset;
2866 void *store = p->store;
2867
2868 assert(devinfo->gen >= 6);
2869
2870 /* Always start after the instruction (such as a WHILE) we're trying to fix
2871 * up.
2872 */
2873 for (offset = next_offset(devinfo, store, start_offset);
2874 offset < p->next_insn_offset;
2875 offset = next_offset(devinfo, store, offset)) {
2876 brw_inst *insn = store + offset;
2877
2878 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2879 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2880 return offset;
2881 }
2882 }
2883 assert(!"not reached");
2884 return start_offset;
2885 }
2886
2887 /* After program generation, go back and update the UIP and JIP of
2888 * BREAK, CONT, and HALT instructions to their correct locations.
2889 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;   /* bytes of store per encoded jump unit */
   void *store = p->store;

   /* JIP/UIP fixup only applies to Gen6+ control flow. */
   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      /* Compaction hasn't run yet, so every instruction is 16 bytes. */
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset +
                           (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end just advances one
          * instruction (1 * br jump units).
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      default:
         break;
      }
   }
}
2962
/**
 * Emit the SEND carrying an FF_SYNC message (fixed-function URB
 * synchronization; message contents are set up by
 * brw_set_ff_sync_message()).
 *
 * \param msg_reg_nr       MRF holding the message payload; on Gen6+ src0 is
 *                         moved there by gen6_resolve_implied_move().
 * \param allocate         Passed through to the message setup.
 * \param response_length  Number of writeback registers expected.
 * \param eot              Whether this message ends the thread.
 */
void brw_ff_sync(struct brw_codegen *p,
                 struct brw_reg dest,
                 unsigned msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-Gen6 the implied MRF move doesn't happen; encode the base MRF. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}
2990
2991 /**
2992 * Emit the SEND instruction necessary to generate stream output data on Gen6
2993 * (for transform feedback).
2994 *
2995 * If send_commit_msg is true, this is the last piece of stream output data
2996 * from this thread, so send the data as a committed write. According to the
2997 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2998 *
2999 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
3000 * writes are complete by sending the final write as a committed write."
3001 */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Streamed vertex buffer writes go through the data cache on Gen7+, the
    * render cache on Gen6, and the legacy write dataport before that.
    */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   /* One-register message; a writeback is requested only for the committed
    * write (see the function comment above).
    */
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, send_commit_msg, true) |
                brw_dp_write_desc(devinfo, binding_table_index,
                                  0, /* msg_control: ignored */
                                  GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                                  0, /* last_render_target: ignored */
                                  send_commit_msg)); /* send_commit_msg */
}
3031
/* Number of message registers needed per logical payload component:
 * SIMD4x2 (exec_size == 0) packs everything into a single register,
 * otherwise one register per channel per 8 lanes of execution.
 */
static unsigned
brw_surface_payload_size(struct brw_codegen *p,
                         unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 */

   const unsigned regs_per_channel = (exec_size <= 8) ? 1 : 2;
   return regs_per_channel * num_channels;
}
3044
/**
 * Emit an untyped atomic dataport message.
 *
 * \param atomic_op          Atomic operation (BRW_AOP_*).
 * \param msg_length         Payload length in registers.
 * \param response_expected  Whether the previous value is written back.
 * \param header_present     Whether the payload starts with a header.
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
   /* exec_size == 0 selects the SIMD4x2 message variant. */
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned response_length =
      brw_surface_payload_size(p, response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}
3081
3082 void
3083 brw_untyped_surface_read(struct brw_codegen *p,
3084 struct brw_reg dst,
3085 struct brw_reg payload,
3086 struct brw_reg surface,
3087 unsigned msg_length,
3088 unsigned num_channels)
3089 {
3090 const struct gen_device_info *devinfo = p->devinfo;
3091 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3092 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3093 GEN7_SFID_DATAPORT_DATA_CACHE);
3094 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3095 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
3096 const unsigned response_length =
3097 brw_surface_payload_size(p, num_channels, exec_size);
3098 const unsigned desc =
3099 brw_message_desc(devinfo, msg_length, response_length, false) |
3100 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3101
3102 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3103 }
3104
3105 void
3106 brw_untyped_surface_write(struct brw_codegen *p,
3107 struct brw_reg payload,
3108 struct brw_reg surface,
3109 unsigned msg_length,
3110 unsigned num_channels,
3111 bool header_present)
3112 {
3113 const struct gen_device_info *devinfo = p->devinfo;
3114 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3115 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3116 GEN7_SFID_DATAPORT_DATA_CACHE);
3117 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3118 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3119 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3120 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3121 has_simd4x2 ? 0 : 8;
3122 const unsigned desc =
3123 brw_message_desc(devinfo, msg_length, 0, header_present) |
3124 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
3125 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3126 const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
3127
3128 brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
3129 payload, surface, desc);
3130 }
3131
3132 static void
3133 brw_set_memory_fence_message(struct brw_codegen *p,
3134 struct brw_inst *insn,
3135 enum brw_message_target sfid,
3136 bool commit_enable,
3137 unsigned bti)
3138 {
3139 const struct gen_device_info *devinfo = p->devinfo;
3140
3141 brw_set_desc(p, insn, brw_message_desc(
3142 devinfo, 1, (commit_enable ? 1 : 0), true));
3143
3144 brw_inst_set_sfid(devinfo, insn, sfid);
3145
3146 switch (sfid) {
3147 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3148 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3149 break;
3150 case GEN7_SFID_DATAPORT_DATA_CACHE:
3151 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3152 break;
3153 default:
3154 unreachable("Not reached");
3155 }
3156
3157 if (commit_enable)
3158 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3159
3160 assert(devinfo->gen >= 11 || bti == 0);
3161 brw_inst_set_binding_table_index(devinfo, insn, bti);
3162 }
3163
/**
 * Emit a memory fence through the data cache -- and additionally through the
 * render cache on IVB -- optionally stalling until the fence writeback
 * arrives.
 *
 * \param dst      Scratch register used for the fence writeback / dependency
 *                 tracking (the message itself writes nothing useful back).
 * \param src      Fence message payload source.
 * \param send_op  SEND-like opcode used to emit the fence.
 * \param stall    Wait for the fence to complete before continuing.
 * \param bti      Binding table index (must be 0 before Gen11).
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg src,
                 enum opcode send_op,
                 bool stall,
                 unsigned bti)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Commit is forced on when stalling, on Gen10+ (HSD workaround below) and
    * on IVB, which needs it for the cross-cache ordering MOV below.
    */
   const bool commit_enable = stall ||
      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
      (devinfo->gen == 7 && !devinfo->is_haswell);
   struct brw_inst *insn;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
   src = retype(vec1(src), BRW_REGISTER_TYPE_UD);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, send_op);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, src);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable, bti);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too. Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, send_op);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, src);
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable, bti);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   if (stall) {
      /* Consume the fence writeback with a null MOV so execution blocks
       * until the fence has completed.
       */
      brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_DST,
                                            brw_get_default_swsb(p).sbid));

      brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);
   }

   brw_pop_insn_state(p);
}
3221
3222 void
3223 brw_pixel_interpolator_query(struct brw_codegen *p,
3224 struct brw_reg dest,
3225 struct brw_reg mrf,
3226 bool noperspective,
3227 unsigned mode,
3228 struct brw_reg data,
3229 unsigned msg_length,
3230 unsigned response_length)
3231 {
3232 const struct gen_device_info *devinfo = p->devinfo;
3233 const uint16_t exec_size = brw_get_default_exec_size(p);
3234 const unsigned slot_group = brw_get_default_group(p) / 16;
3235 const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3236 const unsigned desc =
3237 brw_message_desc(devinfo, msg_length, response_length, false) |
3238 brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
3239 slot_group);
3240
3241 /* brw_send_indirect_message will automatically use a direct send message
3242 * if data is actually immediate.
3243 */
3244 brw_send_indirect_message(p,
3245 GEN7_SFID_PIXEL_INTERPOLATOR,
3246 dest,
3247 mrf,
3248 vec1(data),
3249 desc,
3250 false);
3251 }
3252
/**
 * Write into dst the index of the first enabled channel of the current
 * execution, combining the hardware channel-enable information with the
 * caller-provided dispatch (or vector) \p mask where necessary.  The exact
 * sequence depends on generation and access mode (see the per-path comments
 * below).
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   /* The flag register is only used on Gen7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask. The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n). Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0. We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register. Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3375
/**
 * Write into dst the value of the channel of \p src selected by \p idx
 * (a dynamically uniform channel index).  Uses a simple strided MOV when the
 * source is already uniform or the index is an immediate; otherwise falls
 * back to register-indirect addressing (align1) or a flag-predicated SEL
 * (align16 SIMD4x2).
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address. The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV. Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3499
3500 /**
3501 * This instruction is generated as a single-channel align1 instruction by
3502 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3503 *
3504 * We can't use the typed atomic op in the FS because that has the execution
3505 * mask ANDed with the pixel mask, but we just want to write the one dword for
3506 * all the pixels.
3507 *
3508 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3509 * one u32. So we use the same untyped atomic write message as the pixel
3510 * shader.
3511 *
3512 * The untyped atomic operation requires a BUFFER surface type with RAW
3513 * format, and is only accessible through the legacy DATA_CACHE dataport
3514 * messages.
3515 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Untyped atomics go through data cache 1 on HSW+ (see comment above). */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   /* Two-register payload, scalar (exec size 1) atomic ADD, no writeback. */
   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
                          brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
                                                     false)));

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_binding_table_index(devinfo, send, surf_index);

   brw_pop_insn_state(p);
}
3548
3549
3550 /**
3551 * Emit the SEND message for a barrier
3552 */
3553 void
3554 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3555 {
3556 const struct gen_device_info *devinfo = p->devinfo;
3557 struct brw_inst *inst;
3558
3559 assert(devinfo->gen >= 7);
3560
3561 brw_push_insn_state(p);
3562 brw_set_default_access_mode(p, BRW_ALIGN_1);
3563 inst = next_insn(p, BRW_OPCODE_SEND);
3564 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3565 brw_set_src0(p, inst, src);
3566 brw_set_src1(p, inst, brw_null_reg());
3567 brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
3568
3569 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
3570 brw_inst_set_gateway_subfuncid(devinfo, inst,
3571 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3572
3573 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3574 brw_pop_insn_state(p);
3575 }
3576
3577
3578 /**
3579 * Emit the wait instruction for a barrier
3580 */
3581 void
3582 brw_WAIT(struct brw_codegen *p)
3583 {
3584 const struct gen_device_info *devinfo = p->devinfo;
3585 struct brw_inst *insn;
3586
3587 struct brw_reg src = brw_notification_reg();
3588
3589 insn = next_insn(p, BRW_OPCODE_WAIT);
3590 brw_set_dest(p, insn, src);
3591 brw_set_src0(p, insn, src);
3592 brw_set_src1(p, insn, brw_null_reg());
3593
3594 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3595 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3596 }
3597
3598 void
3599 brw_float_controls_mode(struct brw_codegen *p,
3600 unsigned mode, unsigned mask)
3601 {
3602 /* From the Skylake PRM, Volume 7, page 760:
3603 * "Implementation Restriction on Register Access: When the control
3604 * register is used as an explicit source and/or destination, hardware
3605 * does not ensure execution pipeline coherency. Software must set the
3606 * thread control field to ‘switch’ for an instruction that uses
3607 * control register as an explicit operand."
3608 *
3609 * On Gen12+ this is implemented in terms of SWSB annotations instead.
3610 */
3611 brw_set_default_swsb(p, tgl_swsb_regdist(1));
3612
3613 brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3614 brw_imm_ud(~mask));
3615 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3616 if (p->devinfo->gen < 12)
3617 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3618
3619 if (mode) {
3620 brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3621 brw_imm_ud(mode));
3622 brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
3623 if (p->devinfo->gen < 12)
3624 brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
3625 }
3626
3627 if (p->devinfo->gen >= 12)
3628 brw_SYNC(p, TGL_SYNC_NOP);
3629 }