intel/compiler: Don't change hstride if not needed
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 struct brw_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct gen_device_info *devinfo = p->devinfo;
51 if (devinfo->gen < 6)
52 return;
53
54 if (src->file == BRW_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58 assert(devinfo->gen < 12);
59 brw_push_insn_state(p);
60 brw_set_default_exec_size(p, BRW_EXECUTE_8);
61 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64 retype(*src, BRW_REGISTER_TYPE_UD));
65 brw_pop_insn_state(p);
66 }
67 *src = brw_message_reg(msg_reg_nr);
68 }
69
70 static void
71 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72 {
73 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74 * "The send with EOT should use register space R112-R127 for <src>. This is
75 * to enable loading of a new thread into the same slot while the message
76 * with EOT for current thread is pending dispatch."
77 *
78 * Since we're pretending to have 16 MRFs anyway, we may as well use the
79 * registers required for messages with EOT.
80 */
81 const struct gen_device_info *devinfo = p->devinfo;
82 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83 reg->file = BRW_GENERAL_REGISTER_FILE;
84 reg->nr += GEN7_MRF_HACK_START;
85 }
86 }
87
/**
 * Encode the destination register fields of \p inst from \p dest.
 *
 * Handles the three encoding layouts visible here: the Gen12+ SEND(C)
 * layout, the SENDS/SENDSC split-send layout, and the ordinary ALU
 * layout (direct or indirect, Align1 or Align16).  May also shrink the
 * instruction's exec size to match a small destination when
 * p->automatic_exec_sizes is set.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the file's limits. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where a destination of size Byte with
    * a stride of 1 is only allowed for a packed byte MOV. For any other
    * instruction, the stride must be at least 2, even when the destination
    * is the NULL register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1 &&
       dest.hstride == BRW_HORIZONTAL_STRIDE_1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   /* Gen7+ has no real MRFs; rewrite MRF references onto GRF space. */
   gen7_convert_mrf_to_grf(p, &dest);

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gen12+ SEND(C): only file and register number are encoded; the
       * destination must be a plain, unmodified, register-aligned GRF/ARF.
       */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-gen12 split-send layout: subnr is encoded in 16-byte units. */
      assert(devinfo->gen < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      /* Ordinary ALU destination encoding. */
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            /* Stride 0 is not a valid destination stride; promote to 1. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             *    Although Dst.HorzStride is a don't care for Align16, HW needs
             *    this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            /* Stride 0 is not a valid destination stride; promote to 1. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct. However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
207
/**
 * Encode the source-0 register fields of \p inst from \p reg.
 *
 * Mirrors brw_set_dest(): handles the Gen12+ SEND(C) layout, the
 * SENDS/SENDSC layout, immediates (including 64-bit DF/Q/UQ), and the
 * ordinary ALU layout with direct/indirect addressing and
 * Align1 region / Align16 swizzle encodings.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the file's limits. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   /* Gen7+ has no real MRFs; rewrite MRF references onto GRF space. */
   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gen12+ SEND(C): only file and register number are encoded. */
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Split-send layout: subnr is encoded in 16-byte units. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             reg.vstride == reg.width + 1);
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      /* Ordinary ALU source encoding. */
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* 64-bit (DF/Q/UQ) immediates occupy the whole immediate field;
          * everything else is written as a 32-bit UD payload.
          */
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         /* Pre-gen12, a <=32-bit immediate also requires src1's file/type
          * fields to echo src0's hardware type.
          */
         if (devinfo->gen < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               /* Align16 subnr is encoded in 16-byte units. */
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            /* The address immediate fields differ in size between
             * align1 and align16.
             */
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A <1;1,0>-style scalar region for exec-size-1 instructions. */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
341
342
/**
 * Encode the source-1 register fields of \p inst from \p reg.
 *
 * Handles the SENDS/SENDSC (and Gen12+ SEND(C)) second-source layout,
 * 32-bit immediates, and the ordinary ALU layout with Align1 region /
 * Align16 swizzle encodings.  Unlike src0, src1 only supports direct
 * addressing and may not be an accumulator or MRF.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->gen >= 12 &&
        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
      /* Split-send second source: only file and register number are
       * encoded; must be a plain, register-aligned GRF/ARF.
       */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *     operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      /* Gen7+ has no real MRFs; rewrite MRF references onto GRF space. */
      gen7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            /* Align16 subnr is encoded in 16-byte units. */
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A <1;1,0>-style scalar region for exec-size-1 instructions. */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
448
449 /**
450 * Specify the descriptor and extended descriptor immediate for a SEND(C)
451 * message instruction.
452 */
453 void
454 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
455 unsigned desc, unsigned ex_desc)
456 {
457 const struct gen_device_info *devinfo = p->devinfo;
458 assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
459 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
460 if (devinfo->gen < 12)
461 brw_inst_set_src1_file_type(devinfo, inst,
462 BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
463 brw_inst_set_send_desc(devinfo, inst, desc);
464 if (devinfo->gen >= 9)
465 brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
466 }
467
468 static void brw_set_math_message( struct brw_codegen *p,
469 brw_inst *inst,
470 unsigned function,
471 unsigned integer_type,
472 bool low_precision,
473 unsigned dataType )
474 {
475 const struct gen_device_info *devinfo = p->devinfo;
476 unsigned msg_length;
477 unsigned response_length;
478
479 /* Infer message length from the function */
480 switch (function) {
481 case BRW_MATH_FUNCTION_POW:
482 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
483 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
484 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
485 msg_length = 2;
486 break;
487 default:
488 msg_length = 1;
489 break;
490 }
491
492 /* Infer response length from the function */
493 switch (function) {
494 case BRW_MATH_FUNCTION_SINCOS:
495 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
496 response_length = 2;
497 break;
498 default:
499 response_length = 1;
500 break;
501 }
502
503 brw_set_desc(p, inst, brw_message_desc(
504 devinfo, msg_length, response_length, false));
505
506 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
507 brw_inst_set_math_msg_function(devinfo, inst, function);
508 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
509 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
510 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
511 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
512 brw_inst_set_saturate(devinfo, inst, 0);
513 }
514
515
/* Fill in the descriptor and URB control fields for an FF_SYNC message
 * (URB SFID, opcode 1), which always has a one-register payload with a
 * header.  Fields FF_SYNC ignores are explicitly zeroed.
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
537
/* Fill in the descriptor and URB control fields for a URB write message.
 * \p flags selects OWORD vs HWORD writes, EOT, allocation/complete bits
 * (generation-dependent) and per-slot offsets (gen7+).
 */
static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Transpose swizzling and ALLOCATE are pre-gen7 only; per-slot offsets
    * are gen7+ only.
    */
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
580
/* Fill in the descriptor and control fields for a gen7+ data-port scratch
 * block read/write message.  \p num_regs is the number of registers in the
 * block (1/2/4, or 8 on gen8+); the encoded block size is log2(num_regs)
 * on gen8+ and num_regs-1 before that.
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   /* Gen8+ encodes the block size as log2; earlier gens as num_regs - 1. */
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
610
/* Apply the codegen default state \p state (exec size, masking, predication,
 * flag register, etc.) to a freshly allocated instruction.  Called from
 * brw_next_insn() for every emitted instruction.
 */
static void
brw_inst_set_state(const struct gen_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   /* Gen12+ software scoreboard dependency information. */
   if (devinfo->gen >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* 3-src Align16 instructions encode the flag register in different
    * fields than everything else; flag_subreg packs reg*2 + subreg.
    */
   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
641
642 #define next_insn brw_next_insn
643 brw_inst *
644 brw_next_insn(struct brw_codegen *p, unsigned opcode)
645 {
646 const struct gen_device_info *devinfo = p->devinfo;
647 brw_inst *insn;
648
649 if (p->nr_insn + 1 > p->store_size) {
650 p->store_size <<= 1;
651 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
652 }
653
654 p->next_insn_offset += 16;
655 insn = &p->store[p->nr_insn++];
656
657 memset(insn, 0, sizeof(*insn));
658 brw_inst_set_opcode(devinfo, insn, opcode);
659
660 /* Apply the default instruction state */
661 brw_inst_set_state(devinfo, insn, p->current);
662
663 return insn;
664 }
665
666 static brw_inst *
667 brw_alu1(struct brw_codegen *p, unsigned opcode,
668 struct brw_reg dest, struct brw_reg src)
669 {
670 brw_inst *insn = next_insn(p, opcode);
671 brw_set_dest(p, insn, dest);
672 brw_set_src0(p, insn, src);
673 return insn;
674 }
675
676 static brw_inst *
677 brw_alu2(struct brw_codegen *p, unsigned opcode,
678 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
679 {
680 /* 64-bit immediates are only supported on 1-src instructions */
681 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
682 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
683
684 brw_inst *insn = next_insn(p, opcode);
685 brw_set_dest(p, insn, dest);
686 brw_set_src0(p, insn, src0);
687 brw_set_src1(p, insn, src1);
688 return insn;
689 }
690
691 static int
692 get_3src_subreg_nr(struct brw_reg reg)
693 {
694 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
695 * use 32-bit units (components 0..7). Since they only support F/D/UD
696 * types, this doesn't lose any flexibility, but uses fewer bits.
697 */
698 return reg.subnr / 4;
699 }
700
701 static enum gen10_align1_3src_vertical_stride
702 to_3src_align1_vstride(const struct gen_device_info *devinfo,
703 enum brw_vertical_stride vstride)
704 {
705 switch (vstride) {
706 case BRW_VERTICAL_STRIDE_0:
707 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
708 case BRW_VERTICAL_STRIDE_1:
709 assert(devinfo->gen >= 12);
710 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
711 case BRW_VERTICAL_STRIDE_2:
712 assert(devinfo->gen < 12);
713 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
714 case BRW_VERTICAL_STRIDE_4:
715 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
716 case BRW_VERTICAL_STRIDE_8:
717 case BRW_VERTICAL_STRIDE_16:
718 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
719 default:
720 unreachable("invalid vstride");
721 }
722 }
723
724
725 static enum gen10_align1_3src_src_horizontal_stride
726 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
727 {
728 switch (hstride) {
729 case BRW_HORIZONTAL_STRIDE_0:
730 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
731 case BRW_HORIZONTAL_STRIDE_1:
732 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
733 case BRW_HORIZONTAL_STRIDE_2:
734 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
735 case BRW_HORIZONTAL_STRIDE_4:
736 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
737 default:
738 unreachable("invalid hstride");
739 }
740 }
741
742 static brw_inst *
743 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
744 struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
745 {
746 const struct gen_device_info *devinfo = p->devinfo;
747 brw_inst *inst = next_insn(p, opcode);
748
749 gen7_convert_mrf_to_grf(p, &dest);
750
751 assert(dest.nr < 128);
752
753 if (devinfo->gen >= 10)
754 assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
755 src2.file == BRW_IMMEDIATE_VALUE));
756
757 assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
758 assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
759 assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
760 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
761 assert(src0.address_mode == BRW_ADDRESS_DIRECT);
762 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
763 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
764
765 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
766 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
767 dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
768
769 if (devinfo->gen >= 12) {
770 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
771 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
772 } else {
773 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
774 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
775 BRW_ALIGN1_3SRC_ACCUMULATOR);
776 brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
777 } else {
778 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
779 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
780 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
781 }
782 }
783 brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
784
785 brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
786
787 if (brw_reg_type_is_floating_point(dest.type)) {
788 brw_inst_set_3src_a1_exec_type(devinfo, inst,
789 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
790 } else {
791 brw_inst_set_3src_a1_exec_type(devinfo, inst,
792 BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
793 }
794
795 brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
796 brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
797 brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
798 brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
799
800 if (src0.file == BRW_IMMEDIATE_VALUE) {
801 brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
802 } else {
803 brw_inst_set_3src_a1_src0_vstride(
804 devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
805 brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
806 to_3src_align1_hstride(src0.hstride));
807 brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
808 if (src0.type == BRW_REGISTER_TYPE_NF) {
809 brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
810 } else {
811 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
812 }
813 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
814 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
815 }
816 brw_inst_set_3src_a1_src1_vstride(
817 devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
818 brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
819 to_3src_align1_hstride(src1.hstride));
820
821 brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
822 if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
823 brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
824 } else {
825 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
826 }
827 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
828 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
829
830 if (src2.file == BRW_IMMEDIATE_VALUE) {
831 brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
832 } else {
833 brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
834 to_3src_align1_hstride(src2.hstride));
835 /* no vstride on src2 */
836 brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
837 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
838 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
839 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
840 }
841
842 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
843 src0.file == BRW_IMMEDIATE_VALUE ||
844 (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
845 src0.type == BRW_REGISTER_TYPE_NF));
846 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
847 src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
848 assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
849 src2.file == BRW_IMMEDIATE_VALUE);
850
851 if (devinfo->gen >= 12) {
852 if (src0.file == BRW_IMMEDIATE_VALUE) {
853 brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
854 } else {
855 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
856 }
857
858 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);
859
860 if (src2.file == BRW_IMMEDIATE_VALUE) {
861 brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
862 } else {
863 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
864 }
865 } else {
866 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
867 src0.file == BRW_GENERAL_REGISTER_FILE ?
868 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
869 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
870 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
871 src1.file == BRW_GENERAL_REGISTER_FILE ?
872 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
873 BRW_ALIGN1_3SRC_ACCUMULATOR);
874 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
875 src2.file == BRW_GENERAL_REGISTER_FILE ?
876 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
877 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
878 }
879
880 } else {
881 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
882 dest.file == BRW_MESSAGE_REGISTER_FILE);
883 assert(dest.type == BRW_REGISTER_TYPE_F ||
884 dest.type == BRW_REGISTER_TYPE_DF ||
885 dest.type == BRW_REGISTER_TYPE_D ||
886 dest.type == BRW_REGISTER_TYPE_UD ||
887 (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
888 if (devinfo->gen == 6) {
889 brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
890 dest.file == BRW_MESSAGE_REGISTER_FILE);
891 }
892 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
893 brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
894 brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
895
896 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
897 brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
898 brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
899 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
900 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
901 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
902 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
903 src0.vstride == BRW_VERTICAL_STRIDE_0);
904
905 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
906 brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
907 brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
908 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
909 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
910 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
911 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
912 src1.vstride == BRW_VERTICAL_STRIDE_0);
913
914 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
915 brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
916 brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
917 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
918 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
919 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
920 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
921 src2.vstride == BRW_VERTICAL_STRIDE_0);
922
923 if (devinfo->gen >= 7) {
924 /* Set both the source and destination types based on dest.type,
925 * ignoring the source register types. The MAD and LRP emitters ensure
926 * that all four types are float. The BFE and BFI2 emitters, however,
927 * may send us mixed D and UD types and want us to ignore that and use
928 * the destination type.
929 */
930 brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
931 brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
932
933 /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
934 *
935 * "Three source instructions can use operands with mixed-mode
936 * precision. When SrcType field is set to :f or :hf it defines
937 * precision for source 0 only, and fields Src1Type and Src2Type
938 * define precision for other source operands:
939 *
940 * 0b = :f. Single precision Float (32-bit).
941 * 1b = :hf. Half precision Float (16-bit)."
942 */
943 if (src1.type == BRW_REGISTER_TYPE_HF)
944 brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
945
946 if (src2.type == BRW_REGISTER_TYPE_HF)
947 brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
948 }
949 }
950
951 return inst;
952 }
953
954
955 /***********************************************************************
956 * Convenience routines.
957 */
/* Define brw_<OP>(p, dest, src0): emit a one-source ALU instruction. */
#define ALU1(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	   struct brw_reg dest,				\
	   struct brw_reg src0)				\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);	\
}
965
/* Define brw_<OP>(p, dest, src0, src1): emit a two-source ALU instruction. */
#define ALU2(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	   struct brw_reg dest,				\
	   struct brw_reg src0,				\
	   struct brw_reg src1)				\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
974
/* Define brw_<OP>(p, dest, src0, src1, src2): emit a three-source ALU
 * instruction.  In Align16 mode a scalar source (vertical stride 0) has its
 * value replicated by forcing an XXXX swizzle, since the 3-src encoding
 * expresses scalars via rep_ctrl/swizzle rather than region strides.
 */
#define ALU3(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	   struct brw_reg dest,				\
	   struct brw_reg src0,				\
	   struct brw_reg src1,				\
	   struct brw_reg src2)				\
{							\
   if (p->current->access_mode == BRW_ALIGN_16) {	\
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)	\
         src0.swizzle = BRW_SWIZZLE_XXXX;		\
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)	\
         src1.swizzle = BRW_SWIZZLE_XXXX;		\
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)	\
         src2.swizzle = BRW_SWIZZLE_XXXX;		\
   }							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
992
/* Like ALU3, but additionally asserts that destination and all sources share
 * a single floating-point type (all F or all DF), for opcodes such as LRP
 * that the hardware only defines for uniform float operands.
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,         \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1,                         \
                   struct brw_reg src2)                         \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
                                                                \
   if (p->current->access_mode == BRW_ALIGN_16) {               \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                \
         src0.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                \
         src1.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                \
         src2.swizzle = BRW_SWIZZLE_XXXX;                       \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
1022
1023 /* Rounding operations (other than RNDD) require two instructions - the first
1024 * stores a rounded value (possibly the wrong way) in the dest register, but
1025 * also sets a per-channel "increment bit" in the flag register. A predicated
1026 * add of 1.0 fixes dest to contain the desired result.
1027 *
1028 * Sandybridge and later appear to round correctly without an ADD.
1029 */
/* Define brw_<OP>(p, dest, src) for a rounding opcode; see the comment above
 * for why pre-gen6 needs a predicated ADD after the round instruction.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_codegen *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   const struct gen_device_info *devinfo = p->devinfo;			      \
   brw_inst *rnd, *add;							      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (devinfo->gen < 6) {						      \
      /* turn on round-increments */					      \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);	      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);	      \
   }									      \
}
1048
1049
/* Instantiate the convenience emitters for the simple ALU opcodes. */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
1085
brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use an <X,2,0> source region to
    * read each element twice.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell &&
       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      /* NOTE(review): this checks the encoded (log2) stride fields, i.e. it
       * assumes the incoming region is packed; confirm against callers.
       */
      assert(src0.vstride == src0.width + src0.hstride);
      src0.vstride = src0.hstride;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}
1110
1111 brw_inst *
1112 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1113 struct brw_reg src0, struct brw_reg src1)
1114 {
1115 /* 6.2.2: add */
1116 if (src0.type == BRW_REGISTER_TYPE_F ||
1117 (src0.file == BRW_IMMEDIATE_VALUE &&
1118 src0.type == BRW_REGISTER_TYPE_VF)) {
1119 assert(src1.type != BRW_REGISTER_TYPE_UD);
1120 assert(src1.type != BRW_REGISTER_TYPE_D);
1121 }
1122
1123 if (src1.type == BRW_REGISTER_TYPE_F ||
1124 (src1.file == BRW_IMMEDIATE_VALUE &&
1125 src1.type == BRW_REGISTER_TYPE_VF)) {
1126 assert(src0.type != BRW_REGISTER_TYPE_UD);
1127 assert(src0.type != BRW_REGISTER_TYPE_D);
1128 }
1129
1130 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1131 }
1132
1133 brw_inst *
1134 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1135 struct brw_reg src0, struct brw_reg src1)
1136 {
1137 assert(dest.type == src0.type);
1138 assert(src0.type == src1.type);
1139 switch (src0.type) {
1140 case BRW_REGISTER_TYPE_B:
1141 case BRW_REGISTER_TYPE_UB:
1142 case BRW_REGISTER_TYPE_W:
1143 case BRW_REGISTER_TYPE_UW:
1144 case BRW_REGISTER_TYPE_D:
1145 case BRW_REGISTER_TYPE_UD:
1146 break;
1147 default:
1148 unreachable("Bad type for brw_AVG");
1149 }
1150
1151 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1152 }
1153
1154 brw_inst *
1155 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1156 struct brw_reg src0, struct brw_reg src1)
1157 {
1158 /* 6.32.38: mul */
1159 if (src0.type == BRW_REGISTER_TYPE_D ||
1160 src0.type == BRW_REGISTER_TYPE_UD ||
1161 src1.type == BRW_REGISTER_TYPE_D ||
1162 src1.type == BRW_REGISTER_TYPE_UD) {
1163 assert(dest.type != BRW_REGISTER_TYPE_F);
1164 }
1165
1166 if (src0.type == BRW_REGISTER_TYPE_F ||
1167 (src0.file == BRW_IMMEDIATE_VALUE &&
1168 src0.type == BRW_REGISTER_TYPE_VF)) {
1169 assert(src1.type != BRW_REGISTER_TYPE_UD);
1170 assert(src1.type != BRW_REGISTER_TYPE_D);
1171 }
1172
1173 if (src1.type == BRW_REGISTER_TYPE_F ||
1174 (src1.file == BRW_IMMEDIATE_VALUE &&
1175 src1.type == BRW_REGISTER_TYPE_VF)) {
1176 assert(src0.type != BRW_REGISTER_TYPE_UD);
1177 assert(src0.type != BRW_REGISTER_TYPE_D);
1178 }
1179
1180 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1181 src0.nr != BRW_ARF_ACCUMULATOR);
1182 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1183 src1.nr != BRW_ARF_ACCUMULATOR);
1184
1185 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1186 }
1187
1188 brw_inst *
1189 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1190 struct brw_reg src0, struct brw_reg src1)
1191 {
1192 src0.vstride = BRW_VERTICAL_STRIDE_0;
1193 src0.width = BRW_WIDTH_1;
1194 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1195 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1196 }
1197
1198 brw_inst *
1199 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1200 struct brw_reg src0, struct brw_reg src1)
1201 {
1202 src0.vstride = BRW_VERTICAL_STRIDE_0;
1203 src0.width = BRW_WIDTH_1;
1204 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1205 src1.vstride = BRW_VERTICAL_STRIDE_8;
1206 src1.width = BRW_WIDTH_8;
1207 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1208 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1209 }
1210
/**
 * Emit a float-to-half conversion: F32TO16 on Gen7, or a converting MOV to
 * an HF destination on Gen8+.  When the caller asked for a UD destination,
 * also zero-fill the upper 16 bits of each dword where the hardware won't.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* View the UD destination as strided words so the conversion writes
       * only the low word of each dword.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* Second MOV writes zero into the high word of each dword; the
       * no_dd_clear/no_dd_check pair lets the two writes to the same
       * register pair up without a dependency stall (pre-Gen12 only --
       * Gen12 uses software scoreboarding via swsb instead).
       */
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      brw_set_default_swsb(p, tgl_swsb_null());
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1260
/**
 * Emit a half-to-float conversion: F16TO32 on Gen7, or a converting MOV
 * from an HF source on Gen8+.
 */
brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}
1291
1292
1293 void brw_NOP(struct brw_codegen *p)
1294 {
1295 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1296 memset(insn, 0, sizeof(*insn));
1297 brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1298 }
1299
1300 void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
1301 {
1302 brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
1303 brw_inst_set_cond_modifier(p->devinfo, insn, func);
1304 }
1305
1306 /***********************************************************************
1307 * Comparisons, if/else/endif
1308 */
1309
1310 brw_inst *
1311 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1312 unsigned predicate_control)
1313 {
1314 const struct gen_device_info *devinfo = p->devinfo;
1315 struct brw_reg ip = brw_ip_reg();
1316 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1317
1318 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1319 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1320 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1321 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1322
1323 return inst;
1324 }
1325
1326 static void
1327 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1328 {
1329 p->if_stack[p->if_stack_depth] = inst - p->store;
1330
1331 p->if_stack_depth++;
1332 if (p->if_stack_array_size <= p->if_stack_depth) {
1333 p->if_stack_array_size *= 2;
1334 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1335 p->if_stack_array_size);
1336 }
1337 }
1338
1339 static brw_inst *
1340 pop_if_stack(struct brw_codegen *p)
1341 {
1342 p->if_stack_depth--;
1343 return &p->store[p->if_stack[p->if_stack_depth]];
1344 }
1345
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   /* Grow both loop arrays before writing: this push writes loop_stack at
    * index loop_stack_depth and if_depth_in_loop at index depth + 1, so
    * capacity must exceed depth + 1.
    */
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   /* Store an offset, not a pointer; p->store may be reallocated later. */
   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   /* No IFs are open yet at the new loop nesting level. */
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}
1361
1362 static brw_inst *
1363 get_inner_do_insn(struct brw_codegen *p)
1364 {
1365 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1366 }
1367
1368 /* EU takes the value from the flag register and pushes it onto some
1369 * sort of a stack (presumably merging with any flag value already on
1370 * the stack). Within an if block, the flags at the top of the stack
1371 * control execution on each channel of the unit, eg. on each of the
1372 * 16 pixel values in our wm programs.
1373 *
1374 * When the matching 'else' instruction is reached (presumably by
1375 * countdown of the instruction count patched in by our ELSE/ENDIF
1376 * functions), the relevant flags are inverted.
1377 *
1378 * When the matching 'endif' instruction is reached, the flags are
1379 * popped off. If the stack is now empty, normal execution resumes.
1380 */
/**
 * Emit an IF instruction with the generation-appropriate operand encoding
 * and push it on the if-stack so brw_ELSE/brw_ENDIF can patch its jump
 * targets later.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      /* Pre-gen6: IF operates on IP with an immediate jump count. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: the jump count lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: JIP/UIP fields; filled in by patch_IF_ELSE later. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: JIP/UIP only; Gen12 drops even the src0 immediate. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1425
1426 /* This function is only used for gen6-style IF instructions with an
1427 * embedded comparison (conditional modifier). It is not used on gen7.
1428 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Jump count (destination immediate) is patched later by brw_ENDIF. */
   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The embedded comparison replaces any predicate; the condition decides
    * per-channel execution directly.
    */
   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1451
1452 /**
1453 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1454 */
/**
 * In single-program-flow (SPF) mode, rewrite an already-emitted IF (and
 * optional ELSE) into predicated ADDs on IP that jump over the untaken
 * block.  \p else_inst may be NULL when the IF had no ELSE.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* Jump offsets are in bytes; each instruction is 16 bytes. */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1492
1493 /**
1494 * Patch IF and ELSE instructions with appropriate jump targets.
1495 */
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * \p else_inst may be NULL when the IF block had no ELSE; \p endif_inst is
 * required.  Jump distances are in units of brw_jump_scale(devinfo).
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         /* Gen7+: both the jump-if-false (JIP) and unconditional (UIP)
          * targets of the IF point at the ENDIF.
          */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1583
/**
 * Emit an ELSE instruction with the generation-appropriate encoding and
 * push it on the if-stack; its jump targets are patched by brw_ENDIF.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      /* Pre-gen6: ELSE operates on IP with an immediate jump count. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: jump count in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      /* Gen7: JIP/UIP fields; filled in later. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: JIP/UIP only; Gen12 drops even the src0 immediate. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1622
/**
 * Close the innermost IF/ELSE block: emit an ENDIF (unless pre-gen6 SPF
 * mode converts the whole construct to ADDs) and patch the IF/ELSE jump
 * targets.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation ENDIF operand encoding. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1703
/**
 * Emit a BREAK instruction; jump targets are patched when the enclosing
 * WHILE is emitted.
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      /* Pre-gen6: BREAK must also pop the mask stack entries of any IFs
       * that are open inside the current loop.
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
1730
/**
 * Emit a CONTINUE instruction; jump targets are patched when the enclosing
 * WHILE is emitted.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      /* Pop the mask stack entries of any IFs open inside the loop. */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1754
1755 brw_inst *
1756 gen6_HALT(struct brw_codegen *p)
1757 {
1758 const struct gen_device_info *devinfo = p->devinfo;
1759 brw_inst *insn;
1760
1761 insn = next_insn(p, BRW_OPCODE_HALT);
1762 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1763 if (devinfo->gen < 8) {
1764 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1765 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1766 } else if (devinfo->gen < 12) {
1767 brw_set_src0(p, insn, brw_imm_d(0x0));
1768 }
1769
1770 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1771 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1772 return insn;
1773 }
1774
1775 /* DO/WHILE loop:
1776 *
1777 * The DO/WHILE is just an unterminated loop -- break or continue are
1778 * used for control within the loop. We have a few ways they can be
1779 * done.
1780 *
1781 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1782 * jip and no DO instruction.
1783 *
1784 * For non-uniform control flow pre-gen6, there's a DO instruction to
1785 * push the mask, and a WHILE to jump back, and BREAK to get out and
1786 * pop the mask.
1787 *
1788 * For gen6, there's no more mask stack, so no need for DO. WHILE
1789 * just points back to the first instruction of the loop.
1790 */
1791 brw_inst *
1792 brw_DO(struct brw_codegen *p, unsigned execute_size)
1793 {
1794 const struct gen_device_info *devinfo = p->devinfo;
1795
1796 if (devinfo->gen >= 6 || p->single_program_flow) {
1797 push_loop_stack(p, &p->store[p->nr_insn]);
1798 return &p->store[p->nr_insn];
1799 } else {
1800 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1801
1802 push_loop_stack(p, insn);
1803
1804 /* Override the defaults for this instruction:
1805 */
1806 brw_set_dest(p, insn, brw_null_reg());
1807 brw_set_src0(p, insn, brw_null_reg());
1808 brw_set_src1(p, insn, brw_null_reg());
1809
1810 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1811 brw_inst_set_exec_size(devinfo, insn, execute_size);
1812 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1813
1814 return insn;
1815 }
1816 }
1817
1818 /**
1819 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1820 * instruction here.
1821 *
1822 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1823 * nesting, since it can always just point to the end of the block/current loop.
1824 */
1825 static void
1826 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1827 {
1828 const struct gen_device_info *devinfo = p->devinfo;
1829 brw_inst *do_inst = get_inner_do_insn(p);
1830 brw_inst *inst;
1831 unsigned br = brw_jump_scale(devinfo);
1832
1833 assert(devinfo->gen < 6);
1834
1835 for (inst = while_inst - 1; inst != do_inst; inst--) {
1836 /* If the jump count is != 0, that means that this instruction has already
1837 * been patched because it's part of a loop inside of the one we're
1838 * patching.
1839 */
1840 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1841 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1842 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1843 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1844 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1845 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1846 }
1847 }
1848 }
1849
/**
 * Emit the WHILE that closes the innermost DO/WHILE loop and pop the loop
 * off the generator's loop stack.
 *
 * Gen6+ encodes the backward branch distance directly (JIP on Gen7+, the
 * Gen6 jump-count field before that).  Pre-Gen6 distinguishes uniform
 * control flow (a plain IP add) from divergent loops (a real WHILE, which
 * also triggers patching of the loop's BREAK/CONTINUE instructions).
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         if (devinfo->gen < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         /* Negative distance: jump back to the top of the loop. */
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* Uniform control flow: the WHILE degenerates to an IP add that
          * jumps back to the loop start (16 bytes per instruction).
          */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         /* Inherit the execution size the loop was opened with. */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Point this loop's BREAK/CONTINUE instructions at the WHILE. */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1912
1913 /* FORWARD JUMPS:
1914 */
1915 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1916 {
1917 const struct gen_device_info *devinfo = p->devinfo;
1918 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1919 unsigned jmpi = 1;
1920
1921 if (devinfo->gen >= 5)
1922 jmpi = 2;
1923
1924 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1925 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1926
1927 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1928 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1929 }
1930
1931 /* To integrate with the above, it makes sense that the comparison
1932 * instruction should populate the flag register. It might be simpler
1933 * just to use the flag reg for most WM tasks?
1934 */
1935 void brw_CMP(struct brw_codegen *p,
1936 struct brw_reg dest,
1937 unsigned conditional,
1938 struct brw_reg src0,
1939 struct brw_reg src1)
1940 {
1941 const struct gen_device_info *devinfo = p->devinfo;
1942 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1943
1944 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1945 brw_set_dest(p, insn, dest);
1946 brw_set_src0(p, insn, src0);
1947 brw_set_src1(p, insn, src1);
1948
1949 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1950 * page says:
1951 * "Any CMP instruction with a null destination must use a {switch}."
1952 *
1953 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1954 * mentioned on their work-arounds pages.
1955 */
1956 if (devinfo->gen == 7) {
1957 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1958 dest.nr == BRW_ARF_NULL) {
1959 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1960 }
1961 }
1962 }
1963
1964 /***********************************************************************
1965 * Helpers for the various SEND message types:
1966 */
1967
1968 /** Extended math function, float[8].
1969 */
1970 void gen4_math(struct brw_codegen *p,
1971 struct brw_reg dest,
1972 unsigned function,
1973 unsigned msg_reg_nr,
1974 struct brw_reg src,
1975 unsigned precision )
1976 {
1977 const struct gen_device_info *devinfo = p->devinfo;
1978 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1979 unsigned data_type;
1980 if (has_scalar_region(src)) {
1981 data_type = BRW_MATH_DATA_SCALAR;
1982 } else {
1983 data_type = BRW_MATH_DATA_VECTOR;
1984 }
1985
1986 assert(devinfo->gen < 6);
1987
1988 /* Example code doesn't set predicate_control for send
1989 * instructions.
1990 */
1991 brw_inst_set_pred_control(devinfo, insn, 0);
1992 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1993
1994 brw_set_dest(p, insn, dest);
1995 brw_set_src0(p, insn, src);
1996 brw_set_math_message(p,
1997 insn,
1998 function,
1999 src.type == BRW_REGISTER_TYPE_D,
2000 precision,
2001 data_type);
2002 }
2003
/**
 * Extended math function, Gen6+: a native MATH instruction rather than a
 * message to the extended math shared function.
 *
 * The assertions encode the operand restrictions visible here: the integer
 * division variants take non-float sources (register, or on Gen8+ an
 * immediate src1); every other function takes F (or HF on Gen9+) sources.
 * Gen6 additionally requires packed (hstride 1) operands and ignores
 * source modifiers.
 */
void gen6_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      /* Gen6 also requires packed source regions. */
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      /* Integer division: integer-typed sources only. */
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F ||
             (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
      assert(src1.type == BRW_REGISTER_TYPE_F ||
             (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
2052
2053 /**
2054 * Return the right surface index to access the thread scratch space using
2055 * stateless dataport messages.
2056 */
2057 unsigned
2058 brw_scratch_surface_idx(const struct brw_codegen *p)
2059 {
2060 /* The scratch space is thread-local so IA coherency is unnecessary. */
2061 if (p->devinfo->gen >= 8)
2062 return GEN8_BTI_STATELESS_NON_COHERENT;
2063 else
2064 return BRW_BTI_STATELESS;
2065 }
2066
2067 /**
2068 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2069 * using a constant offset per channel.
2070 *
2071 * The offset must be aligned to oword size (16 bytes). Used for
2072 * register spilling.
2073 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);
   uint32_t msg_type;

   /* On Gen6+ the message offset is in oword (16-byte) units. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One register for the header plus the data payload. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
         src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                     msg_type, 0, /* not a render target */
                                     send_commit_msg));
   }
}
2176
2177
2178 /**
2179 * Read a block of owords (half a GRF each) from the scratch buffer
2180 * using a constant index per channel.
2181 *
2182 * Offset must be aligned to oword size (16 bytes). Used for register
2183 * spilling.
2184 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   /* On Gen6+ the message offset is in oword (16-byte) units. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_READ);

   /* Build the message header: a copy of g0 with the scratch offset
    * written into element 2.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest); /* UW? */
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         /* Pre-Gen6 the payload is addressed via the base MRF. */
         brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, 1, rlen, true) |
                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   }
}
2259
2260 void
2261 gen7_block_read_scratch(struct brw_codegen *p,
2262 struct brw_reg dest,
2263 int num_regs,
2264 unsigned offset)
2265 {
2266 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2267 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2268
2269 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2270
2271 /* The HW requires that the header is present; this is to get the g0.5
2272 * scratch offset.
2273 */
2274 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2275
2276 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2277 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2278 * is 32 bytes, which happens to be the size of a register.
2279 */
2280 offset /= REG_SIZE;
2281 assert(offset < (1 << 12));
2282
2283 gen7_set_dp_scratch_message(p, insn,
2284 false, /* scratch read */
2285 false, /* OWords */
2286 false, /* invalidate after read */
2287 num_regs,
2288 offset,
2289 1, /* mlen: just g0 */
2290 num_regs, /* rlen */
2291 true); /* header present */
2292 }
2293
2294 /**
2295 * Read float[4] vectors from the data port constant cache.
2296 * Location (in buffer) should be a multiple of 16.
2297 * Used for fetching shader constants.
2298 */
void brw_oword_block_read(struct brw_codegen *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_SFID_DATAPORT_READ);
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Outer state: no predication, compression or masking for the whole
    * header-setup + send sequence.
    */
   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Build the message header: a copy of g0 with the read offset written
    * into element 2.
    */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_swsb(p, tgl_swsb_null());
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_inst_set_sfid(devinfo, insn, target_cache);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      /* Pre-Gen6 the payload is addressed via the base MRF. */
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
                brw_dp_read_desc(devinfo, bind_table_index,
                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));

   brw_pop_insn_state(p);
}
2364
/**
 * Emit a render-target (FB) write message: SENDC on Gen6+, SEND before
 * that.
 *
 * On Gen6+ the payload is sent headerless straight from \p payload;
 * pre-Gen6 the payload must live in MRF space and \p implied_header is
 * used as the instruction source.  \p eot marks the shader's final write.
 */
brw_inst *
brw_fb_WRITE(struct brw_codegen *p,
             struct brw_reg payload,
             struct brw_reg implied_header,
             unsigned msg_control,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool eot,
             bool last_render_target,
             bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_dp_write_desc(devinfo, binding_table_index, msg_control,
                                  msg_type, last_render_target,
                                  0 /* send_commit_msg */));
   brw_inst_set_eot(devinfo, insn, eot);

   return insn;
}
2423
/**
 * Emit a Gen9+ render-target read: a SENDC to the render cache dataport.
 *
 * The message subtype selects the SIMD width variant (0 for SIMD16, 1 for
 * SIMD8), and the RT slot group is derived from the current quarter-control
 * group.
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   const unsigned msg_subtype =
      brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_desc(
      p, insn,
      brw_message_desc(devinfo, msg_length, response_length, true) |
      brw_dp_read_desc(devinfo, binding_table_index,
                       per_sample << 5 | msg_subtype,
                       GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                       BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}
2453
2454 /**
2455 * Texture sample instruction.
2456 * Note: the msg_type plus msg_length values determine exactly what kind
2457 * of sampling operation is performed. See volume 4, page 161 of docs.
2458 */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr == -1 (as unsigned) means the caller has already placed
    * the payload; skip the pre-Gen6 implied move.
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_sampler_desc(devinfo, binding_table_index, sampler,
                                 msg_type, simd_mode, return_format));
}
2507
2508 /* Adjust the message header's sampler state pointer to
2509 * select the correct group of 16 samplers.
2510 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         /* Add the byte offset of the selected group of 16 samplers to
          * the state pointer stored in header element 3.
          */
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* Compute the same 16 * (sampler / 16) * 16 offset as the immediate
       * path: mask off bits 4..7 of the index, then shift left by 4.
       */
      brw_push_insn_state(p);
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_set_default_swsb(p, tgl_swsb_regdist(1));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
      brw_pop_insn_state(p);
   }
}
2556
2557 /* All these variables are pretty confusing - we might be better off
2558 * using bitmasks and macros for this, in the old style. Or perhaps
2559 * just having the caller instantiate the fields in dword3 itself.
2560 */
void brw_urb_WRITE(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* Pre-Gen6 the payload must be moved into MRF space first. */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}
2608
/**
 * Emit a SEND whose message descriptor may come from a register.
 *
 * If \p desc is an immediate it is simply combined with \p desc_imm.
 * Otherwise desc | desc_imm is loaded into address register a0.0 first and
 * the SEND takes its descriptor from there: via src1 before Gen12, via the
 * reg32 descriptor select on Gen12+.
 */
void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm,
                          bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send, desc.ud | desc_imm);
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));

      if (devinfo->gen >= 12)
         brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
      else
         brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2662
/**
 * Emit a split SEND(S) message with two payload sources.
 *
 * The message descriptor is (desc | desc_imm) and the extended descriptor is
 * (ex_desc | ex_desc_imm).  When either descriptor isn't fully known at
 * compile time it is built into an address register (a0.0 for the regular
 * descriptor, a0.2 for the extended one) and the SEND is emitted with a
 * register-indirect descriptor selector.
 *
 * \param sfid        Shared function (target unit) of the message.
 * \param dst         Response destination register.
 * \param payload0    First payload source (src0).
 * \param payload1    Second payload source (src1).
 * \param desc        Message descriptor: immediate or UD register.
 * \param desc_imm    Additional descriptor bits ORed into \p desc.
 * \param ex_desc     Extended message descriptor: immediate or register.
 * \param ex_desc_imm Additional bits ORed into \p ex_desc.
 * \param eot         Whether this is an end-of-thread message.
 */
void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      /* Fully known at compile time -- just fold the extra bits in. */
      desc.ud |= desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* Build the descriptor in a0.0 with a scalar NoMask OR. */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       (ex_desc.ud & INTEL_MASK(15, 12)) == 0) {
      /* The immediate path is only usable when ex_desc bits 15:12 are clear,
       * since those bits cannot be represented in the instruction encoding
       * (see the comment in the fallback branch below).
       */
      ex_desc.ud |= ex_desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      /* Build the extended descriptor in a0.2, scalar NoMask. */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register. If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_desc_imm | sfid | eot << 5;

      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding, so
          * we may have fallen back to an indirect extended descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   /* Gen12 dropped the separate SENDS opcode; plain SEND takes two sources. */
   send = next_insn(p, devinfo->gen >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      /* Register-indirect descriptors must live in a0.0 exactly. */
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      /* Register-indirect extended descriptors must be DWord-aligned within
       * the address register; the encoding stores the subreg in DWords.
       */
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2777
/**
 * Emit a surface dataport SEND whose binding table index may come from a
 * register.  A non-immediate \p surface is masked to its low 8 bits and
 * loaded into a0.0, and the message is then emitted with an indirect
 * descriptor via brw_send_indirect_message().
 */
static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* Scalar NoMask state for the descriptor-building AND below. */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
}
2813
2814 static bool
2815 while_jumps_before_offset(const struct gen_device_info *devinfo,
2816 brw_inst *insn, int while_offset, int start_offset)
2817 {
2818 int scale = 16 / brw_jump_scale(devinfo);
2819 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2820 : brw_inst_jip(devinfo, insn);
2821 assert(jip < 0);
2822 return while_offset + jip * scale <= start_offset;
2823 }
2824
2825
2826 static int
2827 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2828 {
2829 int offset;
2830 void *store = p->store;
2831 const struct gen_device_info *devinfo = p->devinfo;
2832
2833 int depth = 0;
2834
2835 for (offset = next_offset(devinfo, store, start_offset);
2836 offset < p->next_insn_offset;
2837 offset = next_offset(devinfo, store, offset)) {
2838 brw_inst *insn = store + offset;
2839
2840 switch (brw_inst_opcode(devinfo, insn)) {
2841 case BRW_OPCODE_IF:
2842 depth++;
2843 break;
2844 case BRW_OPCODE_ENDIF:
2845 if (depth == 0)
2846 return offset;
2847 depth--;
2848 break;
2849 case BRW_OPCODE_WHILE:
2850 /* If the while doesn't jump before our instruction, it's the end
2851 * of a sibling do...while loop. Ignore it.
2852 */
2853 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2854 continue;
2855 /* fallthrough */
2856 case BRW_OPCODE_ELSE:
2857 case BRW_OPCODE_HALT:
2858 if (depth == 0)
2859 return offset;
2860 default:
2861 break;
2862 }
2863 }
2864
2865 return 0;
2866 }
2867
2868 /* There is no DO instruction on gen6, so to find the end of the loop
2869 * we have to see if the loop is jumping back before our start
2870 * instruction.
2871 */
2872 static int
2873 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2874 {
2875 const struct gen_device_info *devinfo = p->devinfo;
2876 int offset;
2877 void *store = p->store;
2878
2879 assert(devinfo->gen >= 6);
2880
2881 /* Always start after the instruction (such as a WHILE) we're trying to fix
2882 * up.
2883 */
2884 for (offset = next_offset(devinfo, store, start_offset);
2885 offset < p->next_insn_offset;
2886 offset = next_offset(devinfo, store, offset)) {
2887 brw_inst *insn = store + offset;
2888
2889 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2890 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2891 return offset;
2892 }
2893 }
2894 assert(!"not reached");
2895 return start_offset;
2896 }
2897
2898 /* After program generation, go back and update the UIP and JIP of
2899 * BREAK, CONT, and HALT instructions to their correct locations.
2900 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   /* Bytes of instruction store per unit of encoded jump distance. */
   int scale = 16 / br;
   void *store = p->store;

   /* Gen4-5 use a different control-flow fixup scheme; nothing to do here. */
   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      /* This pass must run before compaction -- compacted instructions
       * can't be patched in place.
       */
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset +
                           (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* With no block end in sight, jump to the next instruction. */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      default:
         break;
      }
   }
}
2973
/**
 * Emit a SEND carrying an FF_SYNC message (see brw_set_ff_sync_message()).
 *
 * \param msg_reg_nr       MRF the payload is (implicitly) moved into on
 *                         hardware that requires it.
 * \param allocate         Allocation request bit of the message.
 * \param response_length  Response length in registers.
 * \param eot              Terminate the thread after the message.
 */
void brw_ff_sync(struct brw_codegen *p,
                 struct brw_reg dest,
                 unsigned msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* Pre-Gen6 SEND implicitly moves a GRF source to the MRF. */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-Gen6 encodes the message register number in the instruction. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}
3001
3002 /**
3003 * Emit the SEND instruction necessary to generate stream output data on Gen6
3004 * (for transform feedback).
3005 *
3006 * If send_commit_msg is true, this is the last piece of stream output data
3007 * from this thread, so send the data as a committed write. According to the
3008 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
3009 *
3010 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
3011 * writes are complete by sending the final write as a committed write."
3012 */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* The streamed-VB write moved between dataport caches across gens. */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   /* Message length 1; a committed write returns a one-register response,
    * so the response length equals send_commit_msg.
    */
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, send_commit_msg, true) |
                brw_dp_write_desc(devinfo, binding_table_index,
                                  0, /* msg_control: ignored */
                                  GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                                  0, /* last_render_target: ignored */
                                  send_commit_msg)); /* send_commit_msg */
}
3042
/* Number of registers needed for a surface message payload (or response)
 * carrying num_channels components at the given execution size.
 */
static unsigned
brw_surface_payload_size(struct brw_codegen *p,
                         unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   /* SIMD4x2 packs everything into a single register. */
   if (exec_size == 0)
      return 1;

   /* One register per channel up to SIMD8, two registers per channel
    * beyond that.
    */
   const unsigned regs_per_channel = exec_size <= 8 ? 1 : 2;
   return regs_per_channel * num_channels;
}
3055
/**
 * Emit an untyped atomic dataport message.
 *
 * \param atomic_op          BRW_AOP_* operation encoded into the descriptor.
 * \param msg_length         Payload length in registers.
 * \param response_expected  Whether a return value is wanted (affects both
 *                           the descriptor and the response length).
 * \param header_present     Whether the payload starts with a header.
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* HSW+ service untyped atomics from data cache 1. */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
   /* exec_size == 0 selects the SIMD4x2 message variant. */
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned response_length =
      brw_surface_payload_size(p, response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}
3092
3093 void
3094 brw_untyped_surface_read(struct brw_codegen *p,
3095 struct brw_reg dst,
3096 struct brw_reg payload,
3097 struct brw_reg surface,
3098 unsigned msg_length,
3099 unsigned num_channels)
3100 {
3101 const struct gen_device_info *devinfo = p->devinfo;
3102 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3103 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3104 GEN7_SFID_DATAPORT_DATA_CACHE);
3105 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3106 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
3107 const unsigned response_length =
3108 brw_surface_payload_size(p, num_channels, exec_size);
3109 const unsigned desc =
3110 brw_message_desc(devinfo, msg_length, response_length, false) |
3111 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3112
3113 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3114 }
3115
3116 void
3117 brw_untyped_surface_write(struct brw_codegen *p,
3118 struct brw_reg payload,
3119 struct brw_reg surface,
3120 unsigned msg_length,
3121 unsigned num_channels,
3122 bool header_present)
3123 {
3124 const struct gen_device_info *devinfo = p->devinfo;
3125 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3126 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3127 GEN7_SFID_DATAPORT_DATA_CACHE);
3128 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3129 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3130 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3131 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3132 has_simd4x2 ? 0 : 8;
3133 const unsigned desc =
3134 brw_message_desc(devinfo, msg_length, 0, header_present) |
3135 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
3136 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3137 const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
3138
3139 brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
3140 payload, surface, desc);
3141 }
3142
/**
 * Fill out the descriptor and message-specific fields of a memory fence
 * SEND targeting the given dataport \p sfid.
 *
 * \param commit_enable  Request a commit write-back response for the fence.
 * \param bti            Binding table index (must be 0 before Gen11).
 */
static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable,
                             unsigned bti)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* One payload register; one response register iff commit is enabled. */
   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, (commit_enable ? 1 : 0), true));

   brw_inst_set_sfid(devinfo, insn, sfid);

   switch (sfid) {
   case GEN6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GEN7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      /* Only the render and data cache dataports support fences here. */
      unreachable("Not reached");
   }

   /* Commit enable lives in bit 5 of the message control field. */
   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);

   assert(devinfo->gen >= 11 || bti == 0);
   brw_inst_set_binding_table_index(devinfo, insn, bti);
}
3174
/**
 * Emit a memory fence through the data cache dataport (and, on IVB, the
 * render cache as well), optionally stalling until the fence response
 * arrives.
 *
 * \param dst     Register used as the fence response destination for
 *                dependency tracking (the message writes nothing useful).
 * \param send_op SEND-like opcode to use for the fence message(s).
 * \param stall   Block the thread on the fence's commit write-back.
 * \param bti     Binding table index forwarded to the fence message.
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg src,
                 enum opcode send_op,
                 bool stall,
                 unsigned bti)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool commit_enable = stall ||
      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
      (devinfo->gen == 7 && !devinfo->is_haswell);
   struct brw_inst *insn;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
   src = retype(vec1(src), BRW_REGISTER_TYPE_UD);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, send_op);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, src);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable, bti);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too. Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, send_op);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, src);
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable, bti);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   if (stall) {
      /* Consume the fence response with a read-back MOV so the thread
       * blocks until the commit write has landed.
       */
      brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_DST,
                                            brw_get_default_swsb(p).sbid));

      brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);
   }

   brw_pop_insn_state(p);
}
3232
3233 void
3234 brw_pixel_interpolator_query(struct brw_codegen *p,
3235 struct brw_reg dest,
3236 struct brw_reg mrf,
3237 bool noperspective,
3238 unsigned mode,
3239 struct brw_reg data,
3240 unsigned msg_length,
3241 unsigned response_length)
3242 {
3243 const struct gen_device_info *devinfo = p->devinfo;
3244 const uint16_t exec_size = brw_get_default_exec_size(p);
3245 const unsigned slot_group = brw_get_default_group(p) / 16;
3246 const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3247 const unsigned desc =
3248 brw_message_desc(devinfo, msg_length, response_length, false) |
3249 brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
3250 slot_group);
3251
3252 /* brw_send_indirect_message will automatically use a direct send message
3253 * if data is actually immediate.
3254 */
3255 brw_send_indirect_message(p,
3256 GEN7_SFID_PIXEL_INTERPOLATOR,
3257 dest,
3258 mrf,
3259 vec1(data),
3260 desc,
3261 false);
3262 }
3263
/**
 * Write the index of the first active channel into component 0 of \p dst.
 *
 * \param mask Dispatch (or vector) mask, UD-typed; used on Gen8+ Align1 to
 *             filter ce0 down to channels that were actually dispatched.
 *             Pass an immediate 0xffffffff to skip the filtering when the
 *             dispatch mask is known to be tightly packed.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   /* The flag register is only used on Gen7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask. The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n). Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         /* Clear the flag before accumulating the execution mask into it. */
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0. We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register. Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3386
/**
 * Broadcast the value of channel \p idx of \p src into all enabled channels
 * of \p dst.  \p idx may be an immediate or a register; with a register
 * index the channel is fetched through register-indirect addressing in
 * Align1 mode, or selected with a predicated SEL in Align16 (SIMD4x2) mode,
 * where the index can only be zero or one.
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3510
3511 /**
3512 * This instruction is generated as a single-channel align1 instruction by
3513 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3514 *
3515 * We can't use the typed atomic op in the FS because that has the execution
3516 * mask ANDed with the pixel mask, but we just want to write the one dword for
3517 * all the pixels.
3518 *
3519 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3520 * one u32. So we use the same untyped atomic write message as the pixel
3521 * shader.
3522 *
3523 * The untyped atomic operation requires a BUFFER surface type with RAW
3524 * format, and is only accessible through the legacy DATA_CACHE dataport
3525 * messages.
3526 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Untyped atomics are serviced by data cache 1 on HSW+. */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   /* SIMD1 untyped atomic ADD: two payload registers, no response. */
   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
                          brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
                                                     false)));

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_binding_table_index(devinfo, send, surf_index);

   brw_pop_insn_state(p);
}
3559
3560
3561 /**
3562 * Emit the SEND message for a barrier
3563 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   /* The message writes nothing back; src carries the one-register payload. */
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());
   /* One register of payload, no response, no header bit. */
   brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));

   /* Route the message to the gateway's barrier subfunction. */
   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   /* Send once regardless of which channels are enabled. */
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}
3587
3588
3589 /**
3590 * Emit the wait instruction for a barrier
3591 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   /* WAIT operates on the notification register (n0). */
   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   /* The same register is used as both source and destination. */
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   /* Scalar operation: execute once, regardless of the channel mask. */
   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}
3608
/**
 * Update the floating-point controls in cr0.0: first clear the bits in
 * \p mask, then OR in the bits of \p mode (skipped when \p mode is zero).
 */
void
brw_float_controls_mode(struct brw_codegen *p,
                        unsigned mode, unsigned mask)
{
   /* From the Skylake PRM, Volume 7, page 760:
    *  "Implementation Restriction on Register Access: When the control
    *   register is used as an explicit source and/or destination, hardware
    *   does not ensure execution pipeline coherency. Software must set the
    *   thread control field to ‘switch’ for an instruction that uses
    *   control register as an explicit operand."
    *
    * On Gen12+ this is implemented in terms of SWSB annotations instead.
    */
   brw_set_default_swsb(p, tgl_swsb_regdist(1));

   brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
                            brw_imm_ud(~mask));
   brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
   if (p->devinfo->gen < 12)
      brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);

   if (mode) {
      brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
                                 brw_imm_ud(mode));
      brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
      if (p->devinfo->gen < 12)
         brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
   }

   /* Gen12+ replacement for thread-switch: a SYNC.NOP carrying the SWSB
    * annotation set above.
    */
   if (p->devinfo->gen >= 12)
      brw_SYNC(p, TGL_SYNC_NOP);
}