intel/eu/gen12: Don't set DD control, it's gone.
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 struct brw_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct gen_device_info *devinfo = p->devinfo;
51 if (devinfo->gen < 6)
52 return;
53
54 if (src->file == BRW_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58 brw_push_insn_state(p);
59 brw_set_default_exec_size(p, BRW_EXECUTE_8);
60 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
61 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
62 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
63 retype(*src, BRW_REGISTER_TYPE_UD));
64 brw_pop_insn_state(p);
65 }
66 *src = brw_message_reg(msg_reg_nr);
67 }
68
69 static void
70 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
71 {
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
76 *
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
79 */
80 const struct gen_device_info *devinfo = p->devinfo;
81 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
82 reg->file = BRW_GENERAL_REGISTER_FILE;
83 reg->nr += GEN7_MRF_HACK_START;
84 }
85 }
86
/* Encode the destination operand of <inst>.
 *
 * Three encodings are handled: the compact Gen12+ SEND(C) destination,
 * the pre-Gen12 split-send (SENDS/SENDSC) destination, and the generic
 * direct/indirect, align1/align16 destination used by everything else.
 * When p->automatic_exec_sizes is set, the instruction's execution size
 * may also be shrunk to match a small destination region.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where if the destination is Byte,
    * the instruction needs to have a stride of 2 (except for packed byte
    * MOV). This seems to be required even if the destination is the NULL
    * register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   gen7_convert_mrf_to_grf(p, &dest);

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gen12+ SEND(C): direct GRF/ARF only, no subregister, no source
       * modifiers, and either scalar or a packed contiguous region.
       */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gen12 split send: subregister is encoded in 16-byte units. */
      assert(devinfo->gen < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      /* Generic destination encoding. */
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            /* Stride 0 is not a valid destination stride; promote to 1. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             * Although Dst.HorzStride is a don't care for Align16, HW needs
             * this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct. However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
205
/* Encode source operand 0 of <inst>.
 *
 * Handles the Gen12+ SEND(C) and pre-Gen12 SENDS(C) compact source
 * encodings, immediate operands (including 64-bit immediates), and the
 * generic direct/indirect, align1/align16 region/swizzle encodings.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gen12+ SEND(C) src0: direct register only, no subregister, no
       * modifiers, scalar or packed contiguous region.
       */
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gen12 split send src0: GRF only, 16-byte-aligned subregister. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             reg.vstride == reg.width + 1);
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* 64-bit immediates (DF/Q/UQ, and DIM's double) occupy the
          * combined src0/src1 immediate field.
          */
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         /* For <8-byte immediates before Gen12, mirror src0's HW type into
          * the src1 fields.
          */
         if (devinfo->gen < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A scalar source on a scalar instruction gets the canonical
             * <0;1,0> region.
             */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
339
340
/* Encode source operand 1 of <inst>.
 *
 * For split-send instructions (SENDS/SENDSC, and SEND/SENDC on Gen12+)
 * src1 is just a register number; otherwise the full region/swizzle
 * encoding is used.  Only src1 may hold the immediate of a two-source
 * instruction, and it is limited to 32 bits.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->gen >= 12 &&
        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
      /* Split-send src1: direct GRF/ARF, no subregister, no modifiers. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *     operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gen7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A scalar source on a scalar instruction gets the canonical
             * <0;1,0> region.
             */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
446
447 /**
448 * Specify the descriptor and extended descriptor immediate for a SEND(C)
449 * message instruction.
450 */
451 void
452 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
453 unsigned desc, unsigned ex_desc)
454 {
455 const struct gen_device_info *devinfo = p->devinfo;
456 assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
457 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
458 if (devinfo->gen < 12)
459 brw_inst_set_src1_file_type(devinfo, inst,
460 BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
461 brw_inst_set_send_desc(devinfo, inst, desc);
462 if (devinfo->gen >= 9)
463 brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
464 }
465
466 static void brw_set_math_message( struct brw_codegen *p,
467 brw_inst *inst,
468 unsigned function,
469 unsigned integer_type,
470 bool low_precision,
471 unsigned dataType )
472 {
473 const struct gen_device_info *devinfo = p->devinfo;
474 unsigned msg_length;
475 unsigned response_length;
476
477 /* Infer message length from the function */
478 switch (function) {
479 case BRW_MATH_FUNCTION_POW:
480 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
481 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
482 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
483 msg_length = 2;
484 break;
485 default:
486 msg_length = 1;
487 break;
488 }
489
490 /* Infer response length from the function */
491 switch (function) {
492 case BRW_MATH_FUNCTION_SINCOS:
493 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
494 response_length = 2;
495 break;
496 default:
497 response_length = 1;
498 break;
499 }
500
501 brw_set_desc(p, inst, brw_message_desc(
502 devinfo, msg_length, response_length, false));
503
504 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
505 brw_inst_set_math_msg_function(devinfo, inst, function);
506 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
507 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
508 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
509 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
510 brw_inst_set_saturate(devinfo, inst, 0);
511 }
512
513
/* Configure <insn> as a URB FF_SYNC message (URB opcode 1) with a single
 * payload register.  The offset/swizzle/used/complete fields do not apply
 * to FF_SYNC and are zeroed.
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
535
/* Configure <insn> as a URB write message.  Several fields moved or
 * disappeared across generations, hence the gen-dependent encoding below:
 * allocate/used are pre-Gen7 only, per-slot offset is Gen7+, and the
 * complete bit exists only before Gen8.
 */
static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Reject flag/field combinations that do not exist on this generation. */
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
578
/* Configure <inst> as a Gen7+ data-port scratch block read/write message.
 *
 * num_regs must be a power of two (1, 2, 4, and on Gen8+ also 8); the
 * block-size field is log2-encoded on Gen8+ and (num_regs - 1) before.
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
608
/* Copy the default instruction state accumulated via the
 * brw_push/pop_insn_state stack into a freshly allocated instruction.
 */
static void
brw_inst_set_state(const struct gen_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* 3-src align16 instructions keep their flag register fields in a
    * different location than everything else; flag_subreg packs both the
    * register number (/ 2) and subregister (% 2).
    */
   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
637
638 #define next_insn brw_next_insn
639 brw_inst *
640 brw_next_insn(struct brw_codegen *p, unsigned opcode)
641 {
642 const struct gen_device_info *devinfo = p->devinfo;
643 brw_inst *insn;
644
645 if (p->nr_insn + 1 > p->store_size) {
646 p->store_size <<= 1;
647 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
648 }
649
650 p->next_insn_offset += 16;
651 insn = &p->store[p->nr_insn++];
652
653 memset(insn, 0, sizeof(*insn));
654 brw_inst_set_opcode(devinfo, insn, opcode);
655
656 /* Apply the default instruction state */
657 brw_inst_set_state(devinfo, insn, p->current);
658
659 return insn;
660 }
661
662 static brw_inst *
663 brw_alu1(struct brw_codegen *p, unsigned opcode,
664 struct brw_reg dest, struct brw_reg src)
665 {
666 brw_inst *insn = next_insn(p, opcode);
667 brw_set_dest(p, insn, dest);
668 brw_set_src0(p, insn, src);
669 return insn;
670 }
671
672 static brw_inst *
673 brw_alu2(struct brw_codegen *p, unsigned opcode,
674 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
675 {
676 /* 64-bit immediates are only supported on 1-src instructions */
677 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
678 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
679
680 brw_inst *insn = next_insn(p, opcode);
681 brw_set_dest(p, insn, dest);
682 brw_set_src0(p, insn, src0);
683 brw_set_src1(p, insn, src1);
684 return insn;
685 }
686
687 static int
688 get_3src_subreg_nr(struct brw_reg reg)
689 {
690 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
691 * use 32-bit units (components 0..7). Since they only support F/D/UD
692 * types, this doesn't lose any flexibility, but uses fewer bits.
693 */
694 return reg.subnr / 4;
695 }
696
/* Translate a generic vertical stride to the 3-src align1 encoding.
 * Gen12 replaced the stride-2 encoding with a stride-1 encoding, and
 * strides 8 and 16 share a single encoding.
 */
static enum gen10_align1_3src_vertical_stride
to_3src_align1_vstride(const struct gen_device_info *devinfo,
                       enum brw_vertical_stride vstride)
{
   switch (vstride) {
   case BRW_VERTICAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
   case BRW_VERTICAL_STRIDE_1:
      /* The stride-1 encoding only exists on Gen12+. */
      assert(devinfo->gen >= 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
   case BRW_VERTICAL_STRIDE_2:
      /* ...and it replaced the pre-Gen12 stride-2 encoding. */
      assert(devinfo->gen < 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
   case BRW_VERTICAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
   case BRW_VERTICAL_STRIDE_8:
   case BRW_VERTICAL_STRIDE_16:
      /* Both map onto the single "8" encoding. */
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
   default:
      unreachable("invalid vstride");
   }
}
719
720
721 static enum gen10_align1_3src_src_horizontal_stride
722 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
723 {
724 switch (hstride) {
725 case BRW_HORIZONTAL_STRIDE_0:
726 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
727 case BRW_HORIZONTAL_STRIDE_1:
728 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
729 case BRW_HORIZONTAL_STRIDE_2:
730 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
731 case BRW_HORIZONTAL_STRIDE_4:
732 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
733 default:
734 unreachable("invalid hstride");
735 }
736 }
737
738 static brw_inst *
739 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
740 struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
741 {
742 const struct gen_device_info *devinfo = p->devinfo;
743 brw_inst *inst = next_insn(p, opcode);
744
745 gen7_convert_mrf_to_grf(p, &dest);
746
747 assert(dest.nr < 128);
748 assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
749 assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
750 assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
751 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
752 assert(src0.address_mode == BRW_ADDRESS_DIRECT);
753 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
754 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
755
756 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
757 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
758 dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
759
760 if (devinfo->gen >= 12) {
761 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
762 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
763 } else {
764 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
765 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
766 BRW_ALIGN1_3SRC_ACCUMULATOR);
767 brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
768 } else {
769 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
770 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
771 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
772 }
773 }
774 brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
775
776 brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
777
778 if (brw_reg_type_is_floating_point(dest.type)) {
779 brw_inst_set_3src_a1_exec_type(devinfo, inst,
780 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
781 } else {
782 brw_inst_set_3src_a1_exec_type(devinfo, inst,
783 BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
784 }
785
786 brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
787 brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
788 brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
789 brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
790
791 brw_inst_set_3src_a1_src0_vstride(
792 devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
793 brw_inst_set_3src_a1_src1_vstride(
794 devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
795 /* no vstride on src2 */
796
797 brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
798 to_3src_align1_hstride(src0.hstride));
799 brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
800 to_3src_align1_hstride(src1.hstride));
801 brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
802 to_3src_align1_hstride(src2.hstride));
803
804 brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
805 if (src0.type == BRW_REGISTER_TYPE_NF) {
806 brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
807 } else {
808 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
809 }
810 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
811 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
812
813 brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
814 if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
815 brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
816 } else {
817 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
818 }
819 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
820 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
821
822 brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
823 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
824 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
825 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
826
827 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
828 src0.file == BRW_IMMEDIATE_VALUE ||
829 (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
830 src0.type == BRW_REGISTER_TYPE_NF));
831 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
832 src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
833 assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
834 src2.file == BRW_IMMEDIATE_VALUE);
835
836 if (devinfo->gen >= 12) {
837 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
838 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);
839 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
840 } else {
841 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
842 src0.file == BRW_GENERAL_REGISTER_FILE ?
843 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
844 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
845 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
846 src1.file == BRW_GENERAL_REGISTER_FILE ?
847 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
848 BRW_ALIGN1_3SRC_ACCUMULATOR);
849 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
850 src2.file == BRW_GENERAL_REGISTER_FILE ?
851 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
852 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
853 }
854
855 } else {
856 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
857 dest.file == BRW_MESSAGE_REGISTER_FILE);
858 assert(dest.type == BRW_REGISTER_TYPE_F ||
859 dest.type == BRW_REGISTER_TYPE_DF ||
860 dest.type == BRW_REGISTER_TYPE_D ||
861 dest.type == BRW_REGISTER_TYPE_UD ||
862 (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
863 if (devinfo->gen == 6) {
864 brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
865 dest.file == BRW_MESSAGE_REGISTER_FILE);
866 }
867 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
868 brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
869 brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
870
871 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
872 brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
873 brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
874 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
875 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
876 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
877 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
878 src0.vstride == BRW_VERTICAL_STRIDE_0);
879
880 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
881 brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
882 brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
883 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
884 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
885 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
886 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
887 src1.vstride == BRW_VERTICAL_STRIDE_0);
888
889 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
890 brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
891 brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
892 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
893 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
894 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
895 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
896 src2.vstride == BRW_VERTICAL_STRIDE_0);
897
898 if (devinfo->gen >= 7) {
899 /* Set both the source and destination types based on dest.type,
900 * ignoring the source register types. The MAD and LRP emitters ensure
901 * that all four types are float. The BFE and BFI2 emitters, however,
902 * may send us mixed D and UD types and want us to ignore that and use
903 * the destination type.
904 */
905 brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
906 brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
907
908 /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
909 *
910 * "Three source instructions can use operands with mixed-mode
911 * precision. When SrcType field is set to :f or :hf it defines
912 * precision for source 0 only, and fields Src1Type and Src2Type
913 * define precision for other source operands:
914 *
915 * 0b = :f. Single precision Float (32-bit).
916 * 1b = :hf. Half precision Float (16-bit)."
917 */
918 if (src1.type == BRW_REGISTER_TYPE_HF)
919 brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
920
921 if (src2.type == BRW_REGISTER_TYPE_HF)
922 brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
923 }
924 }
925
926 return inst;
927 }
928
929
930 /***********************************************************************
931 * Convenience routines.
932 */
/* Define brw_<OP>(): emit a one-source ALU instruction via brw_alu1(). */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
              struct brw_reg dest,                          \
              struct brw_reg src0)                          \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}
940
/* Define brw_<OP>(): emit a two-source ALU instruction via brw_alu2(). */
#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
              struct brw_reg dest,                          \
              struct brw_reg src0,                          \
              struct brw_reg src1)                          \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}
949
/* Define brw_<OP>(): emit a three-source ALU instruction via brw_alu3().
 * In Align16 mode a <0;...> vertical stride marks a scalar source, which is
 * expressed by replicating the X channel, so rewrite the swizzle to XXXX.
 */
#define ALU3(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
              struct brw_reg dest,                          \
              struct brw_reg src0,                          \
              struct brw_reg src1,                          \
              struct brw_reg src2)                          \
{                                                           \
   if (p->current->access_mode == BRW_ALIGN_16) {           \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)            \
         src0.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)            \
         src1.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)            \
         src2.swizzle = BRW_SWIZZLE_XXXX;                   \
   }                                                        \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
967
/* Like ALU3, but additionally asserts that all four operands are the same
 * floating-point type (all F or all DF) before emitting.
 */
#define ALU3F(OP)                                           \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
              struct brw_reg dest,                          \
              struct brw_reg src0,                          \
              struct brw_reg src1,                          \
              struct brw_reg src2)                          \
{                                                           \
   assert(dest.type == BRW_REGISTER_TYPE_F ||               \
          dest.type == BRW_REGISTER_TYPE_DF);               \
   if (dest.type == BRW_REGISTER_TYPE_F) {                  \
      assert(src0.type == BRW_REGISTER_TYPE_F);             \
      assert(src1.type == BRW_REGISTER_TYPE_F);             \
      assert(src2.type == BRW_REGISTER_TYPE_F);             \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {          \
      assert(src0.type == BRW_REGISTER_TYPE_DF);            \
      assert(src1.type == BRW_REGISTER_TYPE_DF);            \
      assert(src2.type == BRW_REGISTER_TYPE_DF);            \
   }                                                        \
                                                            \
   if (p->current->access_mode == BRW_ALIGN_16) {           \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)            \
         src0.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)            \
         src1.swizzle = BRW_SWIZZLE_XXXX;                   \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)            \
         src2.swizzle = BRW_SWIZZLE_XXXX;                   \
   }                                                        \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
997
998 /* Rounding operations (other than RNDD) require two instructions - the first
999 * stores a rounded value (possibly the wrong way) in the dest register, but
1000 * also sets a per-channel "increment bit" in the flag register. A predicated
1001 * add of 1.0 fixes dest to contain the desired result.
1002 *
1003 * Sandybridge and later appear to round correctly without an ADD.
1004 */
/* Define brw_<OP>() for a rounding op; see the comment above for why
 * pre-gen6 needs a predicated ADD of 1.0 after the round instruction.
 */
#define ROUND(OP)                                                 \
void brw_##OP(struct brw_codegen *p,                              \
              struct brw_reg dest,                                \
              struct brw_reg src)                                 \
{                                                                 \
   const struct gen_device_info *devinfo = p->devinfo;            \
   brw_inst *rnd, *add;                                           \
   rnd = next_insn(p, BRW_OPCODE_##OP);                           \
   brw_set_dest(p, rnd, dest);                                    \
   brw_set_src0(p, rnd, src);                                     \
                                                                  \
   if (devinfo->gen < 6) {                                        \
      /* turn on round-increments */                              \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));              \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
   }                                                              \
}
1023
1024
/* Instantiate the simple one-, two-, and three-source ALU emitters.
 * Ops with extra validation or fix-ups (MOV, ADD, AVG, MUL, LINE, PLN)
 * are written out by hand below instead.
 */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
1060
/* Emit a MOV, applying a source-region workaround for broken F->DF
 * conversion on Ivybridge/Baytrail (gen7, non-Haswell).
 */
brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use an <X,2,0> source region to
    * read each element twice.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell &&
       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      /* Only regions of the canonical <W+H;W,H> shape can be rewritten
       * this way; anything else would change which elements are read.
       */
      assert(src0.vstride == src0.width + src0.hstride);
      src0.vstride = src0.hstride;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}
1085
1086 brw_inst *
1087 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1088 struct brw_reg src0, struct brw_reg src1)
1089 {
1090 /* 6.2.2: add */
1091 if (src0.type == BRW_REGISTER_TYPE_F ||
1092 (src0.file == BRW_IMMEDIATE_VALUE &&
1093 src0.type == BRW_REGISTER_TYPE_VF)) {
1094 assert(src1.type != BRW_REGISTER_TYPE_UD);
1095 assert(src1.type != BRW_REGISTER_TYPE_D);
1096 }
1097
1098 if (src1.type == BRW_REGISTER_TYPE_F ||
1099 (src1.file == BRW_IMMEDIATE_VALUE &&
1100 src1.type == BRW_REGISTER_TYPE_VF)) {
1101 assert(src0.type != BRW_REGISTER_TYPE_UD);
1102 assert(src0.type != BRW_REGISTER_TYPE_D);
1103 }
1104
1105 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1106 }
1107
/* Emit an AVG (integer average).  All three operand types must match and
 * must be an integer type no wider than a dword; floats are not supported
 * by the instruction.
 */
brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}
1128
/* Emit a MUL, asserting the restrictions from the instruction reference
 * (section 6.32.38 "mul"): dword-integer sources may not produce a float
 * destination, float and dword-integer sources may not be mixed, and the
 * accumulator may not be a source.
 */
brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   /* MUL cannot read the accumulator as a source. */
   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}
1162
1163 brw_inst *
1164 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1165 struct brw_reg src0, struct brw_reg src1)
1166 {
1167 src0.vstride = BRW_VERTICAL_STRIDE_0;
1168 src0.width = BRW_WIDTH_1;
1169 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1170 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1171 }
1172
1173 brw_inst *
1174 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1175 struct brw_reg src0, struct brw_reg src1)
1176 {
1177 src0.vstride = BRW_VERTICAL_STRIDE_0;
1178 src0.width = BRW_WIDTH_1;
1179 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1180 src1.vstride = BRW_VERTICAL_STRIDE_8;
1181 src1.width = BRW_WIDTH_8;
1182 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1183 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1184 }
1185
/* Emit a float-to-half conversion: the dedicated F32TO16 instruction on
 * gen7, or a converting MOV to an HF destination on gen8+.  When the
 * destination is UD, the upper 16 bits of each dword are explicitly
 * zero-filled with a second MOV where the hardware does not do it for us.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Write the low word of each dword with stride 2 so the high word
       * can be filled separately below.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* The two MOVs write disjoint halves of the same dwords, so suppress
       * the dependency check between them.  Gen12 dropped the NoDDClr/
       * NoDDChk bits entirely, hence the gen < 12 guards.
       */
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1234
/* Emit a half-to-float conversion: the dedicated F16TO32 instruction on
 * gen7, or a converting MOV from an HF source on gen8+.
 */
brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}
1265
1266
/* Emit a NOP.  The instruction is zeroed first so none of the current
 * default instruction state leaks into the encoding; only the opcode
 * field is then written back.
 */
void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}
1273
1274
1275
1276
1277
1278 /***********************************************************************
1279 * Comparisons, if/else/endif
1280 */
1281
/* Emit a JMPI (jump indexed): IP-relative jump by the number of
 * instruction-size units in `index`.  Always SIMD1 with masking disabled,
 * optionally predicated by `predicate_control`.
 */
brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}
1297
1298 static void
1299 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1300 {
1301 p->if_stack[p->if_stack_depth] = inst - p->store;
1302
1303 p->if_stack_depth++;
1304 if (p->if_stack_array_size <= p->if_stack_depth) {
1305 p->if_stack_array_size *= 2;
1306 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1307 p->if_stack_array_size);
1308 }
1309 }
1310
1311 static brw_inst *
1312 pop_if_stack(struct brw_codegen *p)
1313 {
1314 p->if_stack_depth--;
1315 return &p->store[p->if_stack[p->if_stack_depth]];
1316 }
1317
/* Push a DO instruction (or, gen6+, the position where the loop starts)
 * onto the loop stack.  The parallel if_depth_in_loop array tracks how
 * many IFs are open inside each loop level so BREAK/CONT know how many
 * mask-stack entries to pop on pre-gen6.
 */
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   /* Grow with +1 headroom: if_depth_in_loop below is indexed with the
    * post-increment depth.
    */
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   /* Store as an index, since p->store may be reallocated later. */
   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   /* Reset the IF-nesting count for the loop level just entered (note the
    * already-incremented index — this matches the +1 headroom above).
    */
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}
1333
1334 static brw_inst *
1335 get_inner_do_insn(struct brw_codegen *p)
1336 {
1337 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1338 }
1339
1340 /* EU takes the value from the flag register and pushes it onto some
1341 * sort of a stack (presumably merging with any flag value already on
1342 * the stack). Within an if block, the flags at the top of the stack
1343 * control execution on each channel of the unit, eg. on each of the
1344 * 16 pixel values in our wm programs.
1345 *
1346 * When the matching 'else' instruction is reached (presumably by
1347 * countdown of the instruction count patched in by our ELSE/ENDIF
1348 * functions), the relevant flags are inverted.
1349 *
1350 * When the matching 'endif' instruction is reached, the flags are
1351 * popped off. If the stack is now empty, normal execution resumes.
1352 */
/* Emit an IF with the jump fields left zero; they are patched by
 * brw_ENDIF() via patch_IF_ELSE().  Operand encoding differs per gen:
 * pre-gen6 uses IP arithmetic, gen6 a jump count, gen7+ JIP/UIP fields.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      /* Gen12 dropped the src0 field from flow-control instructions. */
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control forces a thread switch unless we're in SPF mode
    * (where IF becomes an ADD and no switch is needed).
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1397
1398 /* This function is only used for gen6-style IF instructions with an
1399 * embedded comparison (conditional modifier). It is not used on gen7.
1400 */
/* Emit a gen6-style IF that embeds the comparison (conditional modifier
 * plus two sources) in the IF itself, rather than predicating on a flag
 * written by an earlier CMP.  Jump count is patched later by brw_ENDIF().
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1423
1424 /**
1425 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1426 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* Jump distances are in bytes; each instruction is 16 bytes on the
       * gens where SPF conversion is used (< gen6).
       */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1464
1465 /**
1466 * Patch IF and ELSE instructions with appropriate jump targets.
1467 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* Scale factor converting instruction counts to the units the jump
    * fields use (differs by gen).
    */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1555
/* Emit an ELSE with zeroed jump fields (patched later by brw_ENDIF() via
 * patch_IF_ELSE()) and push it on the if-stack above its IF.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      /* Gen12 dropped the src0 field from flow-control instructions. */
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1594
/* Close the innermost IF/ELSE: pop them from the if-stack, emit an ENDIF
 * (except in pre-gen6 SPF mode, where IF/ELSE are rewritten into
 * conditional ADDs instead), and patch all jump targets.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      /* NOTE(review): unlike brw_IF/brw_ELSE above, src0 is set here
       * without a gen < 12 guard — confirm whether gen12 ENDIF should
       * skip it too.
       */
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1675
/* Emit a BREAK.  Jump targets are filled in later (brw_set_uip_jip on
 * gen6+; brw_patch_break_cont pre-gen6).  Pre-gen6 also records how many
 * mask-stack entries to pop: one per IF open inside the current loop.
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
1702
/* Emit a CONTINUE.  As with BREAK, jump targets are patched later, and
 * pre-gen6 records the number of enclosing IFs to pop off the mask stack.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1726
/* Emit a HALT (gen6+).  UIP and JIP are left zero and filled in later.
 * Gen12 needs neither src0 nor src1 set, hence the staged gen checks.
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen < 8) {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   } else if (devinfo->gen < 12) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1746
1747 /* DO/WHILE loop:
1748 *
1749 * The DO/WHILE is just an unterminated loop -- break or continue are
1750 * used for control within the loop. We have a few ways they can be
1751 * done.
1752 *
1753 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1754 * jip and no DO instruction.
1755 *
1756 * For non-uniform control flow pre-gen6, there's a DO instruction to
1757 * push the mask, and a WHILE to jump back, and BREAK to get out and
1758 * pop the mask.
1759 *
1760 * For gen6, there's no more mask stack, so no need for DO. WHILE
1761 * just points back to the first instruction of the loop.
1762 */
1763 brw_inst *
1764 brw_DO(struct brw_codegen *p, unsigned execute_size)
1765 {
1766 const struct gen_device_info *devinfo = p->devinfo;
1767
1768 if (devinfo->gen >= 6 || p->single_program_flow) {
1769 push_loop_stack(p, &p->store[p->nr_insn]);
1770 return &p->store[p->nr_insn];
1771 } else {
1772 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1773
1774 push_loop_stack(p, insn);
1775
1776 /* Override the defaults for this instruction:
1777 */
1778 brw_set_dest(p, insn, brw_null_reg());
1779 brw_set_src0(p, insn, brw_null_reg());
1780 brw_set_src1(p, insn, brw_null_reg());
1781
1782 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1783 brw_inst_set_exec_size(devinfo, insn, execute_size);
1784 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1785
1786 return insn;
1787 }
1788 }
1789
1790 /**
1791 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1792 * instruction here.
1793 *
1794 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1795 * nesting, since it can always just point to the end of the block/current loop.
1796 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   /* Per-generation scaling factor applied to jump distances. */
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   /* Walk backwards from the WHILE to the matching DO, fixing up any
    * BREAK/CONT that still carries a zero jump count.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK targets one past the WHILE (the "+ 1"), leaving the loop. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONT targets the WHILE itself so the loop condition re-runs. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
1821
/* Emit the WHILE terminating the innermost DO loop and pop the loop stack. */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   /* Per-generation scaling factor applied to jump distances. */
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         /* On Gen12+ no immediate src0 is written; JIP is programmed via
          * the dedicated field below on all Gen8+ parts.
          */
         if (devinfo->gen < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         /* NOTE(review): on Gen6 the destination is written as an immediate
          * word and the jump count has its own setter — the encoding appears
          * to overlap; confirm against the Gen6 PRM before changing.
          */
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* Uniform control flow: the loop back-edge is just ADD ip, ip,
          * <offset>, scaled by 16 (presumably bytes per native instruction).
          */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         /* The WHILE inherits the execution size of its matching DO. */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Point any unpatched BREAK/CONT inside the loop at this WHILE. */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1884
1885 /* FORWARD JUMPS:
1886 */
1887 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1888 {
1889 const struct gen_device_info *devinfo = p->devinfo;
1890 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1891 unsigned jmpi = 1;
1892
1893 if (devinfo->gen >= 5)
1894 jmpi = 2;
1895
1896 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1897 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1898
1899 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1900 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1901 }
1902
1903 /* To integrate with the above, it makes sense that the comparison
1904 * instruction should populate the flag register. It might be simpler
1905 * just to use the flag reg for most WM tasks?
1906 */
1907 void brw_CMP(struct brw_codegen *p,
1908 struct brw_reg dest,
1909 unsigned conditional,
1910 struct brw_reg src0,
1911 struct brw_reg src1)
1912 {
1913 const struct gen_device_info *devinfo = p->devinfo;
1914 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1915
1916 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1917 brw_set_dest(p, insn, dest);
1918 brw_set_src0(p, insn, src0);
1919 brw_set_src1(p, insn, src1);
1920
1921 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1922 * page says:
1923 * "Any CMP instruction with a null destination must use a {switch}."
1924 *
1925 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1926 * mentioned on their work-arounds pages.
1927 */
1928 if (devinfo->gen == 7) {
1929 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1930 dest.nr == BRW_ARF_NULL) {
1931 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1932 }
1933 }
1934 }
1935
1936 /***********************************************************************
1937 * Helpers for the various SEND message types:
1938 */
1939
1940 /** Extended math function, float[8].
1941 */
1942 void gen4_math(struct brw_codegen *p,
1943 struct brw_reg dest,
1944 unsigned function,
1945 unsigned msg_reg_nr,
1946 struct brw_reg src,
1947 unsigned precision )
1948 {
1949 const struct gen_device_info *devinfo = p->devinfo;
1950 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1951 unsigned data_type;
1952 if (has_scalar_region(src)) {
1953 data_type = BRW_MATH_DATA_SCALAR;
1954 } else {
1955 data_type = BRW_MATH_DATA_VECTOR;
1956 }
1957
1958 assert(devinfo->gen < 6);
1959
1960 /* Example code doesn't set predicate_control for send
1961 * instructions.
1962 */
1963 brw_inst_set_pred_control(devinfo, insn, 0);
1964 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1965
1966 brw_set_dest(p, insn, dest);
1967 brw_set_src0(p, insn, src);
1968 brw_set_math_message(p,
1969 insn,
1970 function,
1971 src.type == BRW_REGISTER_TYPE_D,
1972 precision,
1973 data_type);
1974 }
1975
/* Emit a native extended-math (MATH) instruction on Gen6+.
 *
 * The asserts spell out the hardware restrictions: GRF (or, on Gen7+, MRF)
 * destination with unit stride, integer types for the integer-division
 * functions, and float (or HF on Gen9+) types otherwise.
 */
void gen6_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      /* Integer division: no float sources; src1 may be immediate on Gen8+. */
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F ||
             (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
      assert(src1.type == BRW_REGISTER_TYPE_F ||
             (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
2024
2025 /**
2026 * Return the right surface index to access the thread scratch space using
2027 * stateless dataport messages.
2028 */
2029 unsigned
2030 brw_scratch_surface_idx(const struct brw_codegen *p)
2031 {
2032 /* The scratch space is thread-local so IA coherency is unnecessary. */
2033 if (p->devinfo->gen >= 8)
2034 return GEN8_BTI_STATELESS_NON_COHERENT;
2035 else
2036 return BRW_BTI_STATELESS;
2037 }
2038
2039 /**
2040 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2041 * using a constant offset per channel.
2042 *
2043 * The offset must be aligned to oword size (16 bytes). Used for
2044 * register spilling.
2045 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Shared function (SFID) handling scratch writes on this generation. */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   uint32_t msg_type;

   /* From Gen6 on the message offset is in owords, not bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One header register plus the payload registers. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header. This is g0, with g0.2 filled with
    * the offset. We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
         src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection. Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                     msg_type, 0, /* not a render target */
                                     send_commit_msg));
   }
}
2144
2145
2146 /**
2147 * Read a block of owords (half a GRF each) from the scratch buffer
2148 * using a constant index per channel.
2149 *
2150 * Offset must be aligned to oword size (16 bytes). Used for register
2151 * spilling.
2152 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* From Gen6 on the message offset is in owords, not bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want. By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything. This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   /* Shared function (SFID) handling scratch reads on this generation. */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_READ);

   /* Build the message header: a copy of g0 with the scratch offset
    * written into element 2.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest); /* UW? */
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, 1, rlen, true) |
                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   }
}
2223
2224 void
2225 gen7_block_read_scratch(struct brw_codegen *p,
2226 struct brw_reg dest,
2227 int num_regs,
2228 unsigned offset)
2229 {
2230 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2231 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2232
2233 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2234
2235 /* The HW requires that the header is present; this is to get the g0.5
2236 * scratch offset.
2237 */
2238 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2239
2240 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2241 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2242 * is 32 bytes, which happens to be the size of a register.
2243 */
2244 offset /= REG_SIZE;
2245 assert(offset < (1 << 12));
2246
2247 gen7_set_dp_scratch_message(p, insn,
2248 false, /* scratch read */
2249 false, /* OWords */
2250 false, /* invalidate after read */
2251 num_regs,
2252 offset,
2253 1, /* mlen: just g0 */
2254 num_regs, /* rlen */
2255 true); /* header present */
2256 }
2257
2258 /**
2259 * Read float[4] vectors from the data port constant cache.
2260 * Location (in buffer) should be a multiple of 16.
2261 * Used for fetching shader constants.
2262 */
void brw_oword_block_read(struct brw_codegen *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Constants come through the constant cache on Gen6+. */
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_SFID_DATAPORT_READ);
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Outer state push covers both the header setup and the SEND itself. */
   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_inst_set_sfid(devinfo, insn, target_cache);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
                brw_dp_read_desc(devinfo, bind_table_index,
                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));

   brw_pop_insn_state(p);
}
2323
/* Emit a framebuffer (render target) write message.  Returns the SEND/SENDC
 * so the caller can further annotate it.  On Gen6+ the message is headerless
 * unless header_present says otherwise; pre-Gen6 it goes through MRFs with
 * an implied header.
 */
brw_inst *
brw_fb_WRITE(struct brw_codegen *p,
             struct brw_reg payload,
             struct brw_reg implied_header,
             unsigned msg_control,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool eot,
             bool last_render_target,
             bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   /* Gen6+ uses SENDC, which waits for dependent render-cache writes. */
   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_dp_write_desc(devinfo, binding_table_index, msg_control,
                                  msg_type, last_render_target,
                                  0 /* send_commit_msg */));
   brw_inst_set_eot(devinfo, insn, eot);

   return insn;
}
2382
2383 brw_inst *
2384 gen9_fb_READ(struct brw_codegen *p,
2385 struct brw_reg dst,
2386 struct brw_reg payload,
2387 unsigned binding_table_index,
2388 unsigned msg_length,
2389 unsigned response_length,
2390 bool per_sample)
2391 {
2392 const struct gen_device_info *devinfo = p->devinfo;
2393 assert(devinfo->gen >= 9);
2394 const unsigned msg_subtype =
2395 brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
2396 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2397
2398 brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
2399 brw_set_dest(p, insn, dst);
2400 brw_set_src0(p, insn, payload);
2401 brw_set_desc(
2402 p, insn,
2403 brw_message_desc(devinfo, msg_length, response_length, true) |
2404 brw_dp_read_desc(devinfo, binding_table_index,
2405 per_sample << 5 | msg_subtype,
2406 GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2407 BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2408 brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
2409
2410 return insn;
2411 }
2412
2413 /**
2414 * Texture sample instruction.
2415 * Note: the msg_type plus msg_length values determine exactly what kind
2416 * of sampling operation is performed. See volume 4, page 161 of docs.
2417 */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr == -1 means the caller already placed the payload. */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf. For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_sampler_desc(devinfo, binding_table_index, sampler,
                                 msg_type, simd_mode, return_format));
}
2466
2467 /* Adjust the message header's sampler state pointer to
2468 * select the correct group of 16 samplers.
2469 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         /* Advance the pointer past (sampler / 16) whole groups of 16
          * sampler states; header element 3 holds the pointer, based on
          * the value in g0.3.
          */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* Compute 16 * (sampler_index / 16) * 16 dynamically:
       * mask off the low nibble, then shift left by 4.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2512
2513 /* All these variables are pretty confusing - we might be better off
2514 * using bitmasks and macros for this, in the old style. Or perhaps
2515 * just having the caller instantiate the fields in dword3 itself.
2516 */
void brw_urb_WRITE(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header:
       * OR 0xff00 into dword 5 of the header (derived from g0.5).
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}
2564
/* Emit a SEND whose descriptor is either an immediate or comes from a
 * register.  In the register case the descriptor (ORed with desc_imm) is
 * first loaded into address register a0.0; on Gen12+ the instruction is
 * instead flagged to read its descriptor from that register directly.
 */
void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm,
                          bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send, desc.ud | desc_imm);
   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));

      if (devinfo->gen >= 12)
         brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
      else
         brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2615
/* Emit a split SEND (SENDS pre-Gen12, SEND on Gen12+) with two payload
 * sources.  Both the descriptor and the extended descriptor may be either
 * immediates or registers; register forms are staged through address
 * registers a0.0 (desc) and a0.2 (ex_desc).
 */
void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      desc.ud |= desc_imm;
   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;
   }

   /* An immediate ex_desc can only be encoded directly when bits 15:12 are
    * clear (they don't exist in the instruction encoding — see below);
    * otherwise fall through to the address-register path.
    */
   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       (ex_desc.ud & INTEL_MASK(15, 12)) == 0) {
      ex_desc.ud |= ex_desc_imm;
   } else {
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register. If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_desc_imm | sfid | eot << 5;

      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding, so
          * we may have fallen back to an indirect extended descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;
   }

   send = next_insn(p, devinfo->gen >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      /* Register descriptor must be a0.0 exactly. */
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      /* Register ex_desc must be a DWord-aligned address subregister. */
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2722
2723 static void
2724 brw_send_indirect_surface_message(struct brw_codegen *p,
2725 unsigned sfid,
2726 struct brw_reg dst,
2727 struct brw_reg payload,
2728 struct brw_reg surface,
2729 unsigned desc_imm)
2730 {
2731 if (surface.file != BRW_IMMEDIATE_VALUE) {
2732 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2733
2734 brw_push_insn_state(p);
2735 brw_set_default_access_mode(p, BRW_ALIGN_1);
2736 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2737 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2738 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2739
2740 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2741 * some surface array is accessed out of bounds.
2742 */
2743 brw_AND(p, addr,
2744 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2745 BRW_GET_SWZ(surface.swizzle, 0)),
2746 brw_imm_ud(0xff));
2747
2748 brw_pop_insn_state(p);
2749
2750 surface = addr;
2751 }
2752
2753 brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
2754 }
2755
2756 static bool
2757 while_jumps_before_offset(const struct gen_device_info *devinfo,
2758 brw_inst *insn, int while_offset, int start_offset)
2759 {
2760 int scale = 16 / brw_jump_scale(devinfo);
2761 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2762 : brw_inst_jip(devinfo, insn);
2763 assert(jip < 0);
2764 return while_offset + jip * scale <= start_offset;
2765 }
2766
2767
2768 static int
2769 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2770 {
2771 int offset;
2772 void *store = p->store;
2773 const struct gen_device_info *devinfo = p->devinfo;
2774
2775 int depth = 0;
2776
2777 for (offset = next_offset(devinfo, store, start_offset);
2778 offset < p->next_insn_offset;
2779 offset = next_offset(devinfo, store, offset)) {
2780 brw_inst *insn = store + offset;
2781
2782 switch (brw_inst_opcode(devinfo, insn)) {
2783 case BRW_OPCODE_IF:
2784 depth++;
2785 break;
2786 case BRW_OPCODE_ENDIF:
2787 if (depth == 0)
2788 return offset;
2789 depth--;
2790 break;
2791 case BRW_OPCODE_WHILE:
2792 /* If the while doesn't jump before our instruction, it's the end
2793 * of a sibling do...while loop. Ignore it.
2794 */
2795 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2796 continue;
2797 /* fallthrough */
2798 case BRW_OPCODE_ELSE:
2799 case BRW_OPCODE_HALT:
2800 if (depth == 0)
2801 return offset;
2802 default:
2803 break;
2804 }
2805 }
2806
2807 return 0;
2808 }
2809
2810 /* There is no DO instruction on gen6, so to find the end of the loop
2811 * we have to see if the loop is jumping back before our start
2812 * instruction.
2813 */
2814 static int
2815 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2816 {
2817 const struct gen_device_info *devinfo = p->devinfo;
2818 int offset;
2819 void *store = p->store;
2820
2821 assert(devinfo->gen >= 6);
2822
2823 /* Always start after the instruction (such as a WHILE) we're trying to fix
2824 * up.
2825 */
2826 for (offset = next_offset(devinfo, store, start_offset);
2827 offset < p->next_insn_offset;
2828 offset = next_offset(devinfo, store, offset)) {
2829 brw_inst *insn = store + offset;
2830
2831 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2832 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2833 return offset;
2834 }
2835 }
2836 assert(!"not reached");
2837 return start_offset;
2838 }
2839
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * Byte offsets are converted into the hardware's jump units by dividing by
 * `scale` (16 / brw_jump_scale); this walk assumes one instruction per 16
 * bytes, which only holds while no instruction has been compacted yet --
 * hence the cmpt_control assertion below.
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* Gen4/5 don't use JIP/UIP patching from this helper. */
   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      /* Compaction would break the fixed 16-byte stride of this loop. */
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset +
                           (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end jumps to the next
          * instruction (1 * br in jump units).
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      default:
         break;
      }
   }
}
2915
2916 void brw_ff_sync(struct brw_codegen *p,
2917 struct brw_reg dest,
2918 unsigned msg_reg_nr,
2919 struct brw_reg src0,
2920 bool allocate,
2921 unsigned response_length,
2922 bool eot)
2923 {
2924 const struct gen_device_info *devinfo = p->devinfo;
2925 brw_inst *insn;
2926
2927 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2928
2929 insn = next_insn(p, BRW_OPCODE_SEND);
2930 brw_set_dest(p, insn, dest);
2931 brw_set_src0(p, insn, src0);
2932 brw_set_src1(p, insn, brw_imm_d(0));
2933
2934 if (devinfo->gen < 6)
2935 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2936
2937 brw_set_ff_sync_message(p,
2938 insn,
2939 allocate,
2940 response_length,
2941 eot);
2942 }
2943
2944 /**
2945 * Emit the SEND instruction necessary to generate stream output data on Gen6
2946 * (for transform feedback).
2947 *
2948 * If send_commit_msg is true, this is the last piece of stream output data
2949 * from this thread, so send the data as a committed write. According to the
2950 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2951 *
2952 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2953 * writes are complete by sending the final write as a committed write."
2954 */
2955 void
2956 brw_svb_write(struct brw_codegen *p,
2957 struct brw_reg dest,
2958 unsigned msg_reg_nr,
2959 struct brw_reg src0,
2960 unsigned binding_table_index,
2961 bool send_commit_msg)
2962 {
2963 const struct gen_device_info *devinfo = p->devinfo;
2964 const unsigned target_cache =
2965 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2966 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2967 BRW_SFID_DATAPORT_WRITE);
2968 brw_inst *insn;
2969
2970 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2971
2972 insn = next_insn(p, BRW_OPCODE_SEND);
2973 brw_inst_set_sfid(devinfo, insn, target_cache);
2974 brw_set_dest(p, insn, dest);
2975 brw_set_src0(p, insn, src0);
2976 brw_set_desc(p, insn,
2977 brw_message_desc(devinfo, 1, send_commit_msg, true) |
2978 brw_dp_write_desc(devinfo, binding_table_index,
2979 0, /* msg_control: ignored */
2980 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2981 0, /* last_render_target: ignored */
2982 send_commit_msg)); /* send_commit_msg */
2983 }
2984
/**
 * Number of registers a surface message payload/response occupies for the
 * given channel count: one register total in SIMD4x2 mode (exec_size == 0),
 * one register per channel up to SIMD8, two per channel above that.
 */
static unsigned
brw_surface_payload_size(struct brw_codegen *p,
                         unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 */

   const unsigned regs_per_channel = exec_size <= 8 ? 1 : 2;
   return regs_per_channel * num_channels;
}
2997
/**
 * Emit an untyped atomic dataport message performing \p atomic_op on the
 * given surface.
 *
 * \param dst                register receiving the returned value (masked
 *                           below so only the used components are enabled)
 * \param payload            message payload registers
 * \param surface            surface index, immediate or register
 * \param atomic_op          atomic operation encoding passed to the
 *                           descriptor helper
 * \param msg_length         payload length in registers
 * \param response_expected  whether the message returns the previous value
 * \param header_present     whether the payload starts with a header
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* HSW+ handles untyped atomics on data cache 1. */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
   /* exec_size == 0 selects the SIMD4x2 message variant. */
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   /* response_expected doubles as the channel count (0 or 1). */
   const unsigned response_length =
      brw_surface_payload_size(p, response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}
3034
3035 void
3036 brw_untyped_surface_read(struct brw_codegen *p,
3037 struct brw_reg dst,
3038 struct brw_reg payload,
3039 struct brw_reg surface,
3040 unsigned msg_length,
3041 unsigned num_channels)
3042 {
3043 const struct gen_device_info *devinfo = p->devinfo;
3044 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3045 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3046 GEN7_SFID_DATAPORT_DATA_CACHE);
3047 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3048 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
3049 const unsigned response_length =
3050 brw_surface_payload_size(p, num_channels, exec_size);
3051 const unsigned desc =
3052 brw_message_desc(devinfo, msg_length, response_length, false) |
3053 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3054
3055 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3056 }
3057
3058 void
3059 brw_untyped_surface_write(struct brw_codegen *p,
3060 struct brw_reg payload,
3061 struct brw_reg surface,
3062 unsigned msg_length,
3063 unsigned num_channels,
3064 bool header_present)
3065 {
3066 const struct gen_device_info *devinfo = p->devinfo;
3067 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3068 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3069 GEN7_SFID_DATAPORT_DATA_CACHE);
3070 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3071 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3072 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3073 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3074 has_simd4x2 ? 0 : 8;
3075 const unsigned desc =
3076 brw_message_desc(devinfo, msg_length, 0, header_present) |
3077 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
3078 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3079 const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
3080
3081 brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
3082 payload, surface, desc);
3083 }
3084
3085 static void
3086 brw_set_memory_fence_message(struct brw_codegen *p,
3087 struct brw_inst *insn,
3088 enum brw_message_target sfid,
3089 bool commit_enable,
3090 unsigned bti)
3091 {
3092 const struct gen_device_info *devinfo = p->devinfo;
3093
3094 brw_set_desc(p, insn, brw_message_desc(
3095 devinfo, 1, (commit_enable ? 1 : 0), true));
3096
3097 brw_inst_set_sfid(devinfo, insn, sfid);
3098
3099 switch (sfid) {
3100 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3101 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3102 break;
3103 case GEN7_SFID_DATAPORT_DATA_CACHE:
3104 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3105 break;
3106 default:
3107 unreachable("Not reached");
3108 }
3109
3110 if (commit_enable)
3111 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3112
3113 assert(devinfo->gen >= 11 || bti == 0);
3114 brw_inst_set_binding_table_index(devinfo, insn, bti);
3115 }
3116
/**
 * Emit a memory fence through the data cache (and, on IVB, also the render
 * cache), optionally followed by a pipeline stall on the fence response.
 *
 * \param dst      scratch register used for the (dummy) fence response and
 *                 dependency tracking; only its first component is used
 * \param src      fence message payload
 * \param send_op  SEND opcode variant to emit the fence with
 * \param stall    if true, emit a read of the response to stall until the
 *                 fence has completed
 * \param bti      binding table index forwarded to the fence message
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg src,
                 enum opcode send_op,
                 bool stall,
                 unsigned bti)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Commit is required when stalling, on gen10+ (HSD workaround below) and
    * on IVB, where the second fence's response is folded into the first.
    */
   const bool commit_enable = stall ||
      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
      (devinfo->gen == 7 && !devinfo->is_haswell);
   struct brw_inst *insn;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
   src = retype(vec1(src), BRW_REGISTER_TYPE_UD);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, send_op);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, src);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable, bti);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too. Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, send_op);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, src);
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable, bti);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   /* Reading the fence response back stalls until the fence completed. */
   if (stall)
      brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);

   brw_pop_insn_state(p);
}
3170
3171 void
3172 brw_pixel_interpolator_query(struct brw_codegen *p,
3173 struct brw_reg dest,
3174 struct brw_reg mrf,
3175 bool noperspective,
3176 unsigned mode,
3177 struct brw_reg data,
3178 unsigned msg_length,
3179 unsigned response_length)
3180 {
3181 const struct gen_device_info *devinfo = p->devinfo;
3182 const uint16_t exec_size = brw_get_default_exec_size(p);
3183 const unsigned slot_group = brw_get_default_group(p) / 16;
3184 const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3185 const unsigned desc =
3186 brw_message_desc(devinfo, msg_length, response_length, false) |
3187 brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
3188 slot_group);
3189
3190 /* brw_send_indirect_message will automatically use a direct send message
3191 * if data is actually immediate.
3192 */
3193 brw_send_indirect_message(p,
3194 GEN7_SFID_PIXEL_INTERPOLATOR,
3195 dest,
3196 mrf,
3197 vec1(data),
3198 desc,
3199 false);
3200 }
3201
/**
 * Emit code that writes into the first component of \p dst the index of the
 * first enabled channel of the current execution mask, combining in the
 * dispatch (or vector) mask \p mask on the paths that read ce0, since ce0
 * alone doesn't account for channels that were never dispatched.
 *
 * \param dst   destination register; only its first component is written
 * \param mask  dispatch/vector mask, must be of UD type (may be an
 *              all-ones immediate when no correction is needed)
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   /* The flag register is only used on Gen7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask.  The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n).  Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         /* Clear the flag before accumulating the execution mask into it. */
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0.  We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.  Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3323
/**
 * Emit code that copies the value of channel \p idx of \p src into all
 * enabled channels of \p dst (a "broadcast").
 *
 * In align1 mode this is done with register-indirect addressing computed
 * from \p idx; in align16 (SIMD4x2) mode the index can only be 0 or 1 and a
 * flag-predicated SEL picks the channel.
 *
 * \param dst  destination register, same type as \p src
 * \param src  direct GRF source without source modifiers
 * \param idx  channel index, immediate or register
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3443
3444 /**
3445 * This instruction is generated as a single-channel align1 instruction by
3446 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3447 *
3448 * We can't use the typed atomic op in the FS because that has the execution
3449 * mask ANDed with the pixel mask, but we just want to write the one dword for
3450 * all the pixels.
3451 *
3452 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3453 * one u32. So we use the same untyped atomic write message as the pixel
3454 * shader.
3455 *
3456 * The untyped atomic operation requires a BUFFER surface type with RAW
3457 * format, and is only accessible through the legacy DATA_CACHE dataport
3458 * messages.
3459 */
3460 void brw_shader_time_add(struct brw_codegen *p,
3461 struct brw_reg payload,
3462 uint32_t surf_index)
3463 {
3464 const struct gen_device_info *devinfo = p->devinfo;
3465 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3466 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3467 GEN7_SFID_DATAPORT_DATA_CACHE);
3468 assert(devinfo->gen >= 7);
3469
3470 brw_push_insn_state(p);
3471 brw_set_default_access_mode(p, BRW_ALIGN_1);
3472 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3473 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3474 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3475
3476 /* We use brw_vec1_reg and unmasked because we want to increment the given
3477 * offset only once.
3478 */
3479 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3480 BRW_ARF_NULL, 0));
3481 brw_set_src0(p, send, brw_vec1_reg(payload.file,
3482 payload.nr, 0));
3483 brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
3484 brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
3485 false)));
3486
3487 brw_inst_set_sfid(devinfo, send, sfid);
3488 brw_inst_set_binding_table_index(devinfo, send, surf_index);
3489
3490 brw_pop_insn_state(p);
3491 }
3492
3493
3494 /**
3495 * Emit the SEND message for a barrier
3496 */
3497 void
3498 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3499 {
3500 const struct gen_device_info *devinfo = p->devinfo;
3501 struct brw_inst *inst;
3502
3503 assert(devinfo->gen >= 7);
3504
3505 brw_push_insn_state(p);
3506 brw_set_default_access_mode(p, BRW_ALIGN_1);
3507 inst = next_insn(p, BRW_OPCODE_SEND);
3508 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3509 brw_set_src0(p, inst, src);
3510 brw_set_src1(p, inst, brw_null_reg());
3511 brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
3512
3513 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
3514 brw_inst_set_gateway_notify(devinfo, inst, 1);
3515 brw_inst_set_gateway_subfuncid(devinfo, inst,
3516 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3517
3518 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3519 brw_pop_insn_state(p);
3520 }
3521
3522
3523 /**
3524 * Emit the wait instruction for a barrier
3525 */
3526 void
3527 brw_WAIT(struct brw_codegen *p)
3528 {
3529 const struct gen_device_info *devinfo = p->devinfo;
3530 struct brw_inst *insn;
3531
3532 struct brw_reg src = brw_notification_reg();
3533
3534 insn = next_insn(p, BRW_OPCODE_WAIT);
3535 brw_set_dest(p, insn, src);
3536 brw_set_src0(p, insn, src);
3537 brw_set_src1(p, insn, brw_null_reg());
3538
3539 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3540 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3541 }
3542
3543 void
3544 brw_float_controls_mode(struct brw_codegen *p,
3545 unsigned mode, unsigned mask)
3546 {
3547 brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3548 brw_imm_ud(~mask));
3549 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3550
3551 /* From the Skylake PRM, Volume 7, page 760:
3552 * "Implementation Restriction on Register Access: When the control
3553 * register is used as an explicit source and/or destination, hardware
3554 * does not ensure execution pipeline coherency. Software must set the
3555 * thread control field to ‘switch’ for an instruction that uses
3556 * control register as an explicit operand."
3557 */
3558 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3559
3560 if (mode) {
3561 brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3562 brw_imm_ud(mode));
3563 brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
3564 brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
3565 }
3566 }