intel/eu/gen12: Codegen SEND descriptor regions correctly.
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 struct brw_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct gen_device_info *devinfo = p->devinfo;
51 if (devinfo->gen < 6)
52 return;
53
54 if (src->file == BRW_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58 brw_push_insn_state(p);
59 brw_set_default_exec_size(p, BRW_EXECUTE_8);
60 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
61 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
62 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
63 retype(*src, BRW_REGISTER_TYPE_UD));
64 brw_pop_insn_state(p);
65 }
66 *src = brw_message_reg(msg_reg_nr);
67 }
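
/* Illustrative usage sketch (the register numbers are arbitrary examples):
 * a generator that wants to SEND from a GRF resolves the implied move
 * before emitting the message, e.g.
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &payload, 1);
 *
 * On Gen6+ this emits a NoMask MOV and rewrites 'payload' to m1; on
 * earlier gens it is a no-op and the hardware performs the move itself.
 */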
68
69 static void
70 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
71 {
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
76 *
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
79 */
80 const struct gen_device_info *devinfo = p->devinfo;
81 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
82 reg->file = BRW_GENERAL_REGISTER_FILE;
83 reg->nr += GEN7_MRF_HACK_START;
84 }
85 }
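
/* A worked example of the mapping above (values per GEN7_MRF_HACK_START
 * == 112 in brw_eu.h): the fake message register m4 becomes g116 on
 * Gen7+, which falls inside the R112-R127 range that EOT messages are
 * required to use.
 */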
86
87 void
88 brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
89 {
90 const struct gen_device_info *devinfo = p->devinfo;
91
92 if (dest.file == BRW_MESSAGE_REGISTER_FILE)
93 assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
94 else if (dest.file == BRW_GENERAL_REGISTER_FILE)
95 assert(dest.nr < 128);
96
97 /* The hardware has a restriction where if the destination is Byte,
98 * the instruction needs to have a stride of 2 (except for packed byte
99 * MOV). This seems to be required even if the destination is the NULL
100 * register.
101 */
102 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
103 dest.nr == BRW_ARF_NULL &&
104 type_sz(dest.type) == 1) {
105 dest.hstride = BRW_HORIZONTAL_STRIDE_2;
106 }
107
108 gen7_convert_mrf_to_grf(p, &dest);
109
110 if (devinfo->gen >= 12 &&
111 (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
112 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
113 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
114 dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
115 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
116 assert(dest.subnr == 0);
117 assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
118 (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
119 dest.vstride == dest.width + 1));
120 assert(!dest.negate && !dest.abs);
121 brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
122 brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
123
124 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
125 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
126 assert(devinfo->gen < 12);
127 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
128 dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
129 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
130 assert(dest.subnr % 16 == 0);
131 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
132 dest.vstride == dest.width + 1);
133 assert(!dest.negate && !dest.abs);
134 brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
135 brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
136 brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
137 } else {
138 brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
139 brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
140
141 if (dest.address_mode == BRW_ADDRESS_DIRECT) {
142 brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
143
144 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
145 brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
146 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
147 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
148 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
149 } else {
150 brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
151 brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
152 if (dest.file == BRW_GENERAL_REGISTER_FILE ||
153 dest.file == BRW_MESSAGE_REGISTER_FILE) {
154 assert(dest.writemask != 0);
155 }
156 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
157 * Although Dst.HorzStride is a don't care for Align16, HW needs
158 * this to be programmed as "01".
159 */
160 brw_inst_set_dst_hstride(devinfo, inst, 1);
161 }
162 } else {
163 brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);
164
165 /* These are different sizes in align1 vs align16:
166 */
167 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
168 brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
169 dest.indirect_offset);
170 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
171 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
172 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
173 } else {
174 brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
175 dest.indirect_offset);
176 /* Even though it's ignored in da16, this still needs to be set to '01'. */
177 brw_inst_set_dst_hstride(devinfo, inst, 1);
178 }
179 }
180 }
181
182 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
183 * or 16 (SIMD16), as that's normally correct. However, when dealing with
184 * small registers, it can be useful for us to automatically reduce it to
185 * match the register size.
186 */
187 if (p->automatic_exec_sizes) {
188 /*
189 * On platforms that support fp64 we can emit instructions with a width
190 * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
191 * these cases we need to make sure that these instructions have their
192 * exec sizes set properly when they are emitted, and we can't rely on
193 * this code to fix it.
194 */
195 bool fix_exec_size;
196 if (devinfo->gen >= 6)
197 fix_exec_size = dest.width < BRW_EXECUTE_4;
198 else
199 fix_exec_size = dest.width < BRW_EXECUTE_8;
200
201 if (fix_exec_size)
202 brw_inst_set_exec_size(devinfo, inst, dest.width);
203 }
204 }
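
/* Example of the automatic exec-size reduction above (the register is an
 * arbitrary illustration): under a default SIMD8 state, a scalar
 * destination such as brw_vec1_grf(10, 0) has dest.width == BRW_WIDTH_1,
 * which is below BRW_EXECUTE_4 in encoded form, so the instruction is
 * narrowed to SIMD1.
 */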
205
206 void
207 brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
208 {
209 const struct gen_device_info *devinfo = p->devinfo;
210
211 if (reg.file == BRW_MESSAGE_REGISTER_FILE)
212 assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
213 else if (reg.file == BRW_GENERAL_REGISTER_FILE)
214 assert(reg.nr < 128);
215
216 gen7_convert_mrf_to_grf(p, &reg);
217
218 if (devinfo->gen >= 6 &&
219 (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
220 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
221 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
222 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
223 /* Any source modifiers or regions will be ignored, since this just
224 * identifies the MRF/GRF to start reading the message contents from.
225 * Check for some likely failures.
226 */
227 assert(!reg.negate);
228 assert(!reg.abs);
229 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
230 }
231
232 if (devinfo->gen >= 12 &&
233 (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
234 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
235 assert(reg.file != BRW_IMMEDIATE_VALUE);
236 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
237 assert(reg.subnr == 0);
238 assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
239 (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
240 reg.vstride == reg.width + 1));
241 assert(!reg.negate && !reg.abs);
242 brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
243 brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
244
245 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
246 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
247 assert(reg.file == BRW_GENERAL_REGISTER_FILE);
248 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
249 assert(reg.subnr % 16 == 0);
250 assert(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
251 reg.vstride == reg.width + 1);
252 assert(!reg.negate && !reg.abs);
253 brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
254 brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
255 } else {
256 brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
257 brw_inst_set_src0_abs(devinfo, inst, reg.abs);
258 brw_inst_set_src0_negate(devinfo, inst, reg.negate);
259 brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
260
261 if (reg.file == BRW_IMMEDIATE_VALUE) {
262 if (reg.type == BRW_REGISTER_TYPE_DF ||
263 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
264 brw_inst_set_imm_df(devinfo, inst, reg.df);
265 else if (reg.type == BRW_REGISTER_TYPE_UQ ||
266 reg.type == BRW_REGISTER_TYPE_Q)
267 brw_inst_set_imm_uq(devinfo, inst, reg.u64);
268 else
269 brw_inst_set_imm_ud(devinfo, inst, reg.ud);
270
271 if (devinfo->gen < 12 && type_sz(reg.type) < 8) {
272 brw_inst_set_src1_reg_file(devinfo, inst,
273 BRW_ARCHITECTURE_REGISTER_FILE);
274 brw_inst_set_src1_reg_hw_type(devinfo, inst,
275 brw_inst_src0_reg_hw_type(devinfo, inst));
276 }
277 } else {
278 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
279 brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
280 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
281 brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
282 } else {
283 brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
284 }
285 } else {
286 brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
287
288 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
289 brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
290 } else {
291 brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
292 }
293 }
294
295 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
296 if (reg.width == BRW_WIDTH_1 &&
297 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
298 brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
299 brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
300 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
301 } else {
302 brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
303 brw_inst_set_src0_width(devinfo, inst, reg.width);
304 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
305 }
306 } else {
307 brw_inst_set_src0_da16_swiz_x(devinfo, inst,
308 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
309 brw_inst_set_src0_da16_swiz_y(devinfo, inst,
310 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
311 brw_inst_set_src0_da16_swiz_z(devinfo, inst,
312 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
313 brw_inst_set_src0_da16_swiz_w(devinfo, inst,
314 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
315
316 if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
317 /* This is an oddity of the fact that we're using the same register
318 * descriptions in align_16 as in align_1:
319 */
320 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
321 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
322 reg.type == BRW_REGISTER_TYPE_DF &&
323 reg.vstride == BRW_VERTICAL_STRIDE_2) {
324 /* From SNB PRM:
325 *
326 * "For Align16 access mode, only encodings of 0000 and 0011
327 * are allowed. Other codes are reserved."
328 *
329 * Presumably the DevSNB behavior applies to IVB as well.
330 */
331 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
332 } else {
333 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
334 }
335 }
336 }
337 }
338 }
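
/* Example of the align1 scalar handling above (illustrative): a width-1
 * source on a SIMD1 instruction is canonicalized to a <0;1,0> region
 * here, so a caller passing e.g. brw_vec1_grf(4, 0) doesn't have to
 * pre-fix the stride fields itself.
 */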
339
340
341 void
342 brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
343 {
344 const struct gen_device_info *devinfo = p->devinfo;
345
346 if (reg.file == BRW_GENERAL_REGISTER_FILE)
347 assert(reg.nr < 128);
348
349 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
350 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
351 (devinfo->gen >= 12 &&
352 (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
353 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
354 assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
355 reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
356 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
357 assert(reg.subnr == 0);
358 assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
359 (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
360 reg.vstride == reg.width + 1));
361 assert(!reg.negate && !reg.abs);
362 brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
363 brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
364 } else {
365 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
366 *
367 * "Accumulator registers may be accessed explicitly as src0
368 * operands only."
369 */
370 assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
371 reg.nr != BRW_ARF_ACCUMULATOR);
372
373 gen7_convert_mrf_to_grf(p, &reg);
374 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
375
376 brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
377 brw_inst_set_src1_abs(devinfo, inst, reg.abs);
378 brw_inst_set_src1_negate(devinfo, inst, reg.negate);
379
380 /* Only src1 can be immediate in two-argument instructions.
381 */
382 assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
383
384 if (reg.file == BRW_IMMEDIATE_VALUE) {
385 /* two-argument instructions can only use 32-bit immediates */
386 assert(type_sz(reg.type) < 8);
387 brw_inst_set_imm_ud(devinfo, inst, reg.ud);
388 } else {
389 /* This is a hardware restriction, which may or may not be lifted
390 * in the future:
391 */
392 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
393 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
394
395 brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
396 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
397 brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
398 } else {
399 brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
400 }
401
402 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
403 if (reg.width == BRW_WIDTH_1 &&
404 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
405 brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
406 brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
407 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
408 } else {
409 brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
410 brw_inst_set_src1_width(devinfo, inst, reg.width);
411 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
412 }
413 } else {
414 brw_inst_set_src1_da16_swiz_x(devinfo, inst,
415 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
416 brw_inst_set_src1_da16_swiz_y(devinfo, inst,
417 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
418 brw_inst_set_src1_da16_swiz_z(devinfo, inst,
419 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
420 brw_inst_set_src1_da16_swiz_w(devinfo, inst,
421 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
422
423 if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
424 /* This is an oddity of the fact that we're using the same register
425 * descriptions in align_16 as in align_1:
426 */
427 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
428 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
429 reg.type == BRW_REGISTER_TYPE_DF &&
430 reg.vstride == BRW_VERTICAL_STRIDE_2) {
431 /* From SNB PRM:
432 *
433 * "For Align16 access mode, only encodings of 0000 and 0011
434 * are allowed. Other codes are reserved."
435 *
436 * Presumably the DevSNB behavior applies to IVB as well.
437 */
438 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
439 } else {
440 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
441 }
442 }
443 }
444 }
445 }
446
447 /**
448 * Specify the descriptor and extended descriptor immediate for a SEND(C)
449 * message instruction.
450 */
451 void
452 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
453 unsigned desc, unsigned ex_desc)
454 {
455 const struct gen_device_info *devinfo = p->devinfo;
456 assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
457 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
458 if (devinfo->gen < 12)
459 brw_inst_set_src1_file_type(devinfo, inst,
460 BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
461 brw_inst_set_send_desc(devinfo, inst, desc);
462 if (devinfo->gen >= 9)
463 brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
464 }
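
/* Illustrative use of the helpers above (the lengths are made-up
 * examples): most callers go through brw_set_desc(), which forwards here
 * with ex_desc == 0, e.g.
 *
 *    unsigned desc = brw_message_desc(devinfo, 2, 1, true);
 *    brw_set_desc(p, inst, desc);  // mlen 2, rlen 1, header present
 *
 * packs the message length, response length and header-present bit into
 * the immediate descriptor.
 */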
465
466 static void brw_set_math_message( struct brw_codegen *p,
467 brw_inst *inst,
468 unsigned function,
469 unsigned integer_type,
470 bool low_precision,
471 unsigned dataType )
472 {
473 const struct gen_device_info *devinfo = p->devinfo;
474 unsigned msg_length;
475 unsigned response_length;
476
477 /* Infer message length from the function */
478 switch (function) {
479 case BRW_MATH_FUNCTION_POW:
480 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
481 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
482 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
483 msg_length = 2;
484 break;
485 default:
486 msg_length = 1;
487 break;
488 }
489
490 /* Infer response length from the function */
491 switch (function) {
492 case BRW_MATH_FUNCTION_SINCOS:
493 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
494 response_length = 2;
495 break;
496 default:
497 response_length = 1;
498 break;
499 }
500
501 brw_set_desc(p, inst, brw_message_desc(
502 devinfo, msg_length, response_length, false));
503
504 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
505 brw_inst_set_math_msg_function(devinfo, inst, function);
506 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
507 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
508 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
509 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
510 brw_inst_set_saturate(devinfo, inst, 0);
511 }
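
/* Concrete instances of the inference above (illustrative): POW takes two
 * operands, so its msg_length is 2; INT_DIV_QUOTIENT_AND_REMAINDER both
 * takes two operands and returns two results, so it gets 2/2; one-operand
 * functions such as INV or SQRT fall through to the 1/1 defaults.
 */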
512
513
514 static void brw_set_ff_sync_message(struct brw_codegen *p,
515 brw_inst *insn,
516 bool allocate,
517 unsigned response_length,
518 bool end_of_thread)
519 {
520 const struct gen_device_info *devinfo = p->devinfo;
521
522 brw_set_desc(p, insn, brw_message_desc(
523 devinfo, 1, response_length, true));
524
525 brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
526 brw_inst_set_eot(devinfo, insn, end_of_thread);
527 brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
528 brw_inst_set_urb_allocate(devinfo, insn, allocate);
529 /* The following fields are not used by FF_SYNC: */
530 brw_inst_set_urb_global_offset(devinfo, insn, 0);
531 brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
532 brw_inst_set_urb_used(devinfo, insn, 0);
533 brw_inst_set_urb_complete(devinfo, insn, 0);
534 }
535
536 static void brw_set_urb_message( struct brw_codegen *p,
537 brw_inst *insn,
538 enum brw_urb_write_flags flags,
539 unsigned msg_length,
540 unsigned response_length,
541 unsigned offset,
542 unsigned swizzle_control )
543 {
544 const struct gen_device_info *devinfo = p->devinfo;
545
546 assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
547 assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
548 assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
549
550 brw_set_desc(p, insn, brw_message_desc(
551 devinfo, msg_length, response_length, true));
552
553 brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
554 brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));
555
556 if (flags & BRW_URB_WRITE_OWORD) {
557 assert(msg_length == 2); /* header + one OWORD of data */
558 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
559 } else {
560 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
561 }
562
563 brw_inst_set_urb_global_offset(devinfo, insn, offset);
564 brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
565
566 if (devinfo->gen < 8) {
567 brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
568 }
569
570 if (devinfo->gen < 7) {
571 brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
572 brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
573 } else {
574 brw_inst_set_urb_per_slot_offset(devinfo, insn,
575 !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
576 }
577 }
578
579 static void
580 gen7_set_dp_scratch_message(struct brw_codegen *p,
581 brw_inst *inst,
582 bool write,
583 bool dword,
584 bool invalidate_after_read,
585 unsigned num_regs,
586 unsigned addr_offset,
587 unsigned mlen,
588 unsigned rlen,
589 bool header_present)
590 {
591 const struct gen_device_info *devinfo = p->devinfo;
592 assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
593 (devinfo->gen >= 8 && num_regs == 8));
594 const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
595 num_regs - 1);
596
597 brw_set_desc(p, inst, brw_message_desc(
598 devinfo, mlen, rlen, header_present));
599
600 brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
601 brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
602 brw_inst_set_scratch_read_write(devinfo, inst, write);
603 brw_inst_set_scratch_type(devinfo, inst, dword);
604 brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
605 brw_inst_set_scratch_block_size(devinfo, inst, block_size);
606 brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
607 }
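
/* Worked example of the block-size encoding above (illustrative): for
 * num_regs == 4, Gen8+ encodes _mesa_logbase2(4) == 2 while Gen7 encodes
 * num_regs - 1 == 3; both denote a four-register scratch block.
 */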
608
609 static void
610 brw_inst_set_state(const struct gen_device_info *devinfo,
611 brw_inst *insn,
612 const struct brw_insn_state *state)
613 {
614 brw_inst_set_exec_size(devinfo, insn, state->exec_size);
615 brw_inst_set_group(devinfo, insn, state->group);
616 brw_inst_set_compression(devinfo, insn, state->compressed);
617 brw_inst_set_access_mode(devinfo, insn, state->access_mode);
618 brw_inst_set_mask_control(devinfo, insn, state->mask_control);
619 brw_inst_set_saturate(devinfo, insn, state->saturate);
620 brw_inst_set_pred_control(devinfo, insn, state->predicate);
621 brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);
622
623 if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
624 state->access_mode == BRW_ALIGN_16) {
625 brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
626 if (devinfo->gen >= 7)
627 brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
628 } else {
629 brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
630 if (devinfo->gen >= 7)
631 brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
632 }
633
634 if (devinfo->gen >= 6)
635 brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
636 }
637
638 #define next_insn brw_next_insn
639 brw_inst *
640 brw_next_insn(struct brw_codegen *p, unsigned opcode)
641 {
642 const struct gen_device_info *devinfo = p->devinfo;
643 brw_inst *insn;
644
645 if (p->nr_insn + 1 > p->store_size) {
646 p->store_size <<= 1;
647 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
648 }
649
650 p->next_insn_offset += 16;
651 insn = &p->store[p->nr_insn++];
652
653 memset(insn, 0, sizeof(*insn));
654 brw_inst_set_opcode(devinfo, insn, opcode);
655
656 /* Apply the default instruction state */
657 brw_inst_set_state(devinfo, insn, p->current);
658
659 return insn;
660 }
661
662 static brw_inst *
663 brw_alu1(struct brw_codegen *p, unsigned opcode,
664 struct brw_reg dest, struct brw_reg src)
665 {
666 brw_inst *insn = next_insn(p, opcode);
667 brw_set_dest(p, insn, dest);
668 brw_set_src0(p, insn, src);
669 return insn;
670 }
671
672 static brw_inst *
673 brw_alu2(struct brw_codegen *p, unsigned opcode,
674 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
675 {
676 /* 64-bit immediates are only supported on 1-src instructions */
677 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
678 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
679
680 brw_inst *insn = next_insn(p, opcode);
681 brw_set_dest(p, insn, dest);
682 brw_set_src0(p, insn, src0);
683 brw_set_src1(p, insn, src1);
684 return insn;
685 }
686
687 static int
688 get_3src_subreg_nr(struct brw_reg reg)
689 {
690 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
691 * use 32-bit units (components 0..7). Since they only support F/D/UD
692 * types, this doesn't lose any flexibility, but uses fewer bits.
693 */
694 return reg.subnr / 4;
695 }
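
/* Example (illustrative): a byte offset of subnr == 16 encodes as
 * component 16 / 4 == 4, i.e. the fifth 32-bit channel of the register.
 */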
696
697 static enum gen10_align1_3src_vertical_stride
698 to_3src_align1_vstride(const struct gen_device_info *devinfo,
699 enum brw_vertical_stride vstride)
700 {
701 switch (vstride) {
702 case BRW_VERTICAL_STRIDE_0:
703 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
704 case BRW_VERTICAL_STRIDE_1:
705 assert(devinfo->gen >= 12);
706 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
707 case BRW_VERTICAL_STRIDE_2:
708 assert(devinfo->gen < 12);
709 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
710 case BRW_VERTICAL_STRIDE_4:
711 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
712 case BRW_VERTICAL_STRIDE_8:
713 case BRW_VERTICAL_STRIDE_16:
714 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
715 default:
716 unreachable("invalid vstride");
717 }
718 }
719
720
721 static enum gen10_align1_3src_src_horizontal_stride
722 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
723 {
724 switch (hstride) {
725 case BRW_HORIZONTAL_STRIDE_0:
726 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
727 case BRW_HORIZONTAL_STRIDE_1:
728 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
729 case BRW_HORIZONTAL_STRIDE_2:
730 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
731 case BRW_HORIZONTAL_STRIDE_4:
732 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
733 default:
734 unreachable("invalid hstride");
735 }
736 }
737
738 static brw_inst *
739 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
740 struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
741 {
742 const struct gen_device_info *devinfo = p->devinfo;
743 brw_inst *inst = next_insn(p, opcode);
744
745 gen7_convert_mrf_to_grf(p, &dest);
746
747 assert(dest.nr < 128);
748 assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
749 assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
750 assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
751 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
752 assert(src0.address_mode == BRW_ADDRESS_DIRECT);
753 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
754 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
755
756 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
757 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
758 dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
759
760 if (devinfo->gen >= 12) {
761 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
762 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
763 } else {
764 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
765 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
766 BRW_ALIGN1_3SRC_ACCUMULATOR);
767 brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
768 } else {
769 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
770 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
771 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
772 }
773 }
774 brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
775
776 brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
777
778 if (brw_reg_type_is_floating_point(dest.type)) {
779 brw_inst_set_3src_a1_exec_type(devinfo, inst,
780 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
781 } else {
782 brw_inst_set_3src_a1_exec_type(devinfo, inst,
783 BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
784 }
785
786 brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
787 brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
788 brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
789 brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
790
791 brw_inst_set_3src_a1_src0_vstride(
792 devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
793 brw_inst_set_3src_a1_src1_vstride(
794 devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
795 /* no vstride on src2 */
796
797 brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
798 to_3src_align1_hstride(src0.hstride));
799 brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
800 to_3src_align1_hstride(src1.hstride));
801 brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
802 to_3src_align1_hstride(src2.hstride));
803
804 brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
805 if (src0.type == BRW_REGISTER_TYPE_NF) {
806 brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
807 } else {
808 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
809 }
810 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
811 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
812
813 brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
814 if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
815 brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
816 } else {
817 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
818 }
819 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
820 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
821
822 brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
823 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
824 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
825 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
826
827 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
828 src0.file == BRW_IMMEDIATE_VALUE ||
829 (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
830 src0.type == BRW_REGISTER_TYPE_NF));
831 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
832 src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
833 assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
834 src2.file == BRW_IMMEDIATE_VALUE);
835
836 if (devinfo->gen >= 12) {
837 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
838 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);
839 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
840 } else {
841 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
842 src0.file == BRW_GENERAL_REGISTER_FILE ?
843 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
844 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
845 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
846 src1.file == BRW_GENERAL_REGISTER_FILE ?
847 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
848 BRW_ALIGN1_3SRC_ACCUMULATOR);
849 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
850 src2.file == BRW_GENERAL_REGISTER_FILE ?
851 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
852 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
853 }
854
855 } else {
856 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
857 dest.file == BRW_MESSAGE_REGISTER_FILE);
858 assert(dest.type == BRW_REGISTER_TYPE_F ||
859 dest.type == BRW_REGISTER_TYPE_DF ||
860 dest.type == BRW_REGISTER_TYPE_D ||
861 dest.type == BRW_REGISTER_TYPE_UD ||
862 (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
863 if (devinfo->gen == 6) {
864 brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
865 dest.file == BRW_MESSAGE_REGISTER_FILE);
866 }
867 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
868 brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
869 brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
870
871 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
872 brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
873 brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
874 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
875 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
876 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
877 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
878 src0.vstride == BRW_VERTICAL_STRIDE_0);
879
880 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
881 brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
882 brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
883 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
884 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
885 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
886 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
887 src1.vstride == BRW_VERTICAL_STRIDE_0);
888
889 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
890 brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
891 brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
892 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
893 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
894 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
895 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
896 src2.vstride == BRW_VERTICAL_STRIDE_0);
897
898 if (devinfo->gen >= 7) {
899 /* Set both the source and destination types based on dest.type,
900 * ignoring the source register types. The MAD and LRP emitters ensure
901 * that all four types are float. The BFE and BFI2 emitters, however,
902 * may send us mixed D and UD types and want us to ignore that and use
903 * the destination type.
904 */
905 brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
906 brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
907
908 /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
909 *
910 * "Three source instructions can use operands with mixed-mode
911 * precision. When SrcType field is set to :f or :hf it defines
912 * precision for source 0 only, and fields Src1Type and Src2Type
913 * define precision for other source operands:
914 *
915 * 0b = :f. Single precision Float (32-bit).
916 * 1b = :hf. Half precision Float (16-bit)."
917 */
918 if (src1.type == BRW_REGISTER_TYPE_HF)
919 brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
920
921 if (src2.type == BRW_REGISTER_TYPE_HF)
922 brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
923 }
924 }
925
926 return inst;
927 }
928
929
930 /***********************************************************************
931 * Convenience routines.
932 */
933 #define ALU1(OP) \
934 brw_inst *brw_##OP(struct brw_codegen *p, \
935 struct brw_reg dest, \
936 struct brw_reg src0) \
937 { \
938 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
939 }
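
/* For reference (illustrative expansion): ALU1(NOT) below produces
 *
 *    brw_inst *brw_NOT(struct brw_codegen *p,
 *                      struct brw_reg dest,
 *                      struct brw_reg src0)
 *    {
 *       return brw_alu1(p, BRW_OPCODE_NOT, dest, src0);
 *    }
 *
 * and the ALU2/ALU3 variants follow the same pattern with more sources.
 */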
940
941 #define ALU2(OP) \
942 brw_inst *brw_##OP(struct brw_codegen *p, \
943 struct brw_reg dest, \
944 struct brw_reg src0, \
945 struct brw_reg src1) \
946 { \
947 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
948 }
949
950 #define ALU3(OP) \
951 brw_inst *brw_##OP(struct brw_codegen *p, \
952 struct brw_reg dest, \
953 struct brw_reg src0, \
954 struct brw_reg src1, \
955 struct brw_reg src2) \
956 { \
957 if (p->current->access_mode == BRW_ALIGN_16) { \
958 if (src0.vstride == BRW_VERTICAL_STRIDE_0) \
959 src0.swizzle = BRW_SWIZZLE_XXXX; \
960 if (src1.vstride == BRW_VERTICAL_STRIDE_0) \
961 src1.swizzle = BRW_SWIZZLE_XXXX; \
962 if (src2.vstride == BRW_VERTICAL_STRIDE_0) \
963 src2.swizzle = BRW_SWIZZLE_XXXX; \
964 } \
965 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
966 }
967
968 #define ALU3F(OP) \
969 brw_inst *brw_##OP(struct brw_codegen *p, \
970 struct brw_reg dest, \
971 struct brw_reg src0, \
972 struct brw_reg src1, \
973 struct brw_reg src2) \
974 { \
975 assert(dest.type == BRW_REGISTER_TYPE_F || \
976 dest.type == BRW_REGISTER_TYPE_DF); \
977 if (dest.type == BRW_REGISTER_TYPE_F) { \
978 assert(src0.type == BRW_REGISTER_TYPE_F); \
979 assert(src1.type == BRW_REGISTER_TYPE_F); \
980 assert(src2.type == BRW_REGISTER_TYPE_F); \
981 } else if (dest.type == BRW_REGISTER_TYPE_DF) { \
982 assert(src0.type == BRW_REGISTER_TYPE_DF); \
983 assert(src1.type == BRW_REGISTER_TYPE_DF); \
984 assert(src2.type == BRW_REGISTER_TYPE_DF); \
985 } \
986 \
987 if (p->current->access_mode == BRW_ALIGN_16) { \
988 if (src0.vstride == BRW_VERTICAL_STRIDE_0) \
989 src0.swizzle = BRW_SWIZZLE_XXXX; \
990 if (src1.vstride == BRW_VERTICAL_STRIDE_0) \
991 src1.swizzle = BRW_SWIZZLE_XXXX; \
992 if (src2.vstride == BRW_VERTICAL_STRIDE_0) \
993 src2.swizzle = BRW_SWIZZLE_XXXX; \
994 } \
995 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
996 }
997
998 /* Rounding operations (other than RNDD) require two instructions - the first
999 * stores a rounded value (possibly the wrong way) in the dest register, but
1000 * also sets a per-channel "increment bit" in the flag register. A predicated
1001 * add of 1.0 fixes dest to contain the desired result.
1002 *
1003 * Sandybridge and later appear to round correctly without an ADD.
1004 */
1005 #define ROUND(OP) \
1006 void brw_##OP(struct brw_codegen *p, \
1007 struct brw_reg dest, \
1008 struct brw_reg src) \
1009 { \
1010 const struct gen_device_info *devinfo = p->devinfo; \
1011 brw_inst *rnd, *add; \
1012 rnd = next_insn(p, BRW_OPCODE_##OP); \
1013 brw_set_dest(p, rnd, dest); \
1014 brw_set_src0(p, rnd, src); \
1015 \
1016 if (devinfo->gen < 6) { \
1017 /* turn on round-increments */ \
1018 brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \
1019 add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \
1020 brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
1021 } \
1022 }
1023
1024
1025 ALU2(SEL)
1026 ALU1(NOT)
1027 ALU2(AND)
1028 ALU2(OR)
1029 ALU2(XOR)
1030 ALU2(SHR)
1031 ALU2(SHL)
1032 ALU1(DIM)
1033 ALU2(ASR)
1034 ALU2(ROL)
1035 ALU2(ROR)
1036 ALU3(CSEL)
1037 ALU1(FRC)
1038 ALU1(RNDD)
1039 ALU2(MAC)
1040 ALU2(MACH)
1041 ALU1(LZD)
1042 ALU2(DP4)
1043 ALU2(DPH)
1044 ALU2(DP3)
1045 ALU2(DP2)
1046 ALU3(MAD)
1047 ALU3F(LRP)
1048 ALU1(BFREV)
1049 ALU3(BFE)
1050 ALU2(BFI1)
1051 ALU3(BFI2)
1052 ALU1(FBH)
1053 ALU1(FBL)
1054 ALU1(CBIT)
1055 ALU2(ADDC)
1056 ALU2(SUBB)
1057
1058 ROUND(RNDZ)
1059 ROUND(RNDE)
1060
1061 brw_inst *
1062 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1063 {
1064 const struct gen_device_info *devinfo = p->devinfo;
1065
1066 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1067 * To avoid the problems that causes, we use an <X,2,0> source region to
1068 * read each element twice.
1069 */
1070 if (devinfo->gen == 7 && !devinfo->is_haswell &&
1071 brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1072 dest.type == BRW_REGISTER_TYPE_DF &&
1073 (src0.type == BRW_REGISTER_TYPE_F ||
1074 src0.type == BRW_REGISTER_TYPE_D ||
1075 src0.type == BRW_REGISTER_TYPE_UD) &&
1076 !has_scalar_region(src0)) {
1077 assert(src0.vstride == src0.width + src0.hstride);
1078 src0.vstride = src0.hstride;
1079 src0.width = BRW_WIDTH_2;
1080 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1081 }
1082
1083 return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1084 }
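
/* Example of the region fix-up above (illustrative): a packed <8;8,1>:F
 * source satisfies vstride == width + hstride in encoded form (4 == 3 + 1)
 * and is rewritten to <1;2,0>:F, which reads every 32-bit element twice so
 * that the ignored odd channels merely see duplicates.
 */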
1085
1086 brw_inst *
1087 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1088 struct brw_reg src0, struct brw_reg src1)
1089 {
1090 /* 6.2.2: add */
1091 if (src0.type == BRW_REGISTER_TYPE_F ||
1092 (src0.file == BRW_IMMEDIATE_VALUE &&
1093 src0.type == BRW_REGISTER_TYPE_VF)) {
1094 assert(src1.type != BRW_REGISTER_TYPE_UD);
1095 assert(src1.type != BRW_REGISTER_TYPE_D);
1096 }
1097
1098 if (src1.type == BRW_REGISTER_TYPE_F ||
1099 (src1.file == BRW_IMMEDIATE_VALUE &&
1100 src1.type == BRW_REGISTER_TYPE_VF)) {
1101 assert(src0.type != BRW_REGISTER_TYPE_UD);
1102 assert(src0.type != BRW_REGISTER_TYPE_D);
1103 }
1104
1105 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1106 }
1107
1108 brw_inst *
1109 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1110 struct brw_reg src0, struct brw_reg src1)
1111 {
1112 assert(dest.type == src0.type);
1113 assert(src0.type == src1.type);
1114 switch (src0.type) {
1115 case BRW_REGISTER_TYPE_B:
1116 case BRW_REGISTER_TYPE_UB:
1117 case BRW_REGISTER_TYPE_W:
1118 case BRW_REGISTER_TYPE_UW:
1119 case BRW_REGISTER_TYPE_D:
1120 case BRW_REGISTER_TYPE_UD:
1121 break;
1122 default:
1123 unreachable("Bad type for brw_AVG");
1124 }
1125
1126 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1127 }
1128
1129 brw_inst *
1130 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1131 struct brw_reg src0, struct brw_reg src1)
1132 {
1133 /* 6.32.38: mul */
1134 if (src0.type == BRW_REGISTER_TYPE_D ||
1135 src0.type == BRW_REGISTER_TYPE_UD ||
1136 src1.type == BRW_REGISTER_TYPE_D ||
1137 src1.type == BRW_REGISTER_TYPE_UD) {
1138 assert(dest.type != BRW_REGISTER_TYPE_F);
1139 }
1140
1141 if (src0.type == BRW_REGISTER_TYPE_F ||
1142 (src0.file == BRW_IMMEDIATE_VALUE &&
1143 src0.type == BRW_REGISTER_TYPE_VF)) {
1144 assert(src1.type != BRW_REGISTER_TYPE_UD);
1145 assert(src1.type != BRW_REGISTER_TYPE_D);
1146 }
1147
1148 if (src1.type == BRW_REGISTER_TYPE_F ||
1149 (src1.file == BRW_IMMEDIATE_VALUE &&
1150 src1.type == BRW_REGISTER_TYPE_VF)) {
1151 assert(src0.type != BRW_REGISTER_TYPE_UD);
1152 assert(src0.type != BRW_REGISTER_TYPE_D);
1153 }
1154
1155 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1156 src0.nr != BRW_ARF_ACCUMULATOR);
1157 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1158 src1.nr != BRW_ARF_ACCUMULATOR);
1159
1160 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1161 }
1162
1163 brw_inst *
1164 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1165 struct brw_reg src0, struct brw_reg src1)
1166 {
1167 src0.vstride = BRW_VERTICAL_STRIDE_0;
1168 src0.width = BRW_WIDTH_1;
1169 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1170 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1171 }
1172
1173 brw_inst *
1174 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1175 struct brw_reg src0, struct brw_reg src1)
1176 {
1177 src0.vstride = BRW_VERTICAL_STRIDE_0;
1178 src0.width = BRW_WIDTH_1;
1179 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1180 src1.vstride = BRW_VERTICAL_STRIDE_8;
1181 src1.width = BRW_WIDTH_8;
1182 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1183 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1184 }
1185
1186 brw_inst *
1187 brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1188 {
1189 const struct gen_device_info *devinfo = p->devinfo;
1190 const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1191 /* The F32TO16 instruction doesn't support 32-bit destination types in
1192 * Align1 mode, and neither does the Gen8 implementation in terms of a
1193 * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
1194 * an undocumented feature.
1195 */
1196 const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
1197 (!align16 || devinfo->gen >= 8));
1198 brw_inst *inst;
1199
1200 if (align16) {
1201 assert(dst.type == BRW_REGISTER_TYPE_UD);
1202 } else {
1203 assert(dst.type == BRW_REGISTER_TYPE_UD ||
1204 dst.type == BRW_REGISTER_TYPE_W ||
1205 dst.type == BRW_REGISTER_TYPE_UW ||
1206 dst.type == BRW_REGISTER_TYPE_HF);
1207 }
1208
1209 brw_push_insn_state(p);
1210
1211 if (needs_zero_fill) {
1212 brw_set_default_access_mode(p, BRW_ALIGN_1);
1213 dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
1214 }
1215
1216 if (devinfo->gen >= 8) {
1217 inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
1218 } else {
1219 assert(devinfo->gen == 7);
1220 inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
1221 }
1222
1223 if (needs_zero_fill) {
1224 brw_inst_set_no_dd_clear(devinfo, inst, true);
1225 inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
1226 brw_inst_set_no_dd_check(devinfo, inst, true);
1227 }
1228
1229 brw_pop_insn_state(p);
1230 return inst;
1231 }
1232
1233 brw_inst *
1234 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1235 {
1236 const struct gen_device_info *devinfo = p->devinfo;
1237 bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1238
1239 if (align16) {
1240 assert(src.type == BRW_REGISTER_TYPE_UD);
1241 } else {
1242 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1243 *
1244 * "Because this instruction does not have a 16-bit floating-point
1245 * type, the source data type must be Word (W). The destination type
1246 * must be F (Float)."
1247 */
1248 if (src.type == BRW_REGISTER_TYPE_UD)
1249 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1250
1251 assert(src.type == BRW_REGISTER_TYPE_W ||
1252 src.type == BRW_REGISTER_TYPE_UW ||
1253 src.type == BRW_REGISTER_TYPE_HF);
1254 }
1255
1256 if (devinfo->gen >= 8) {
1257 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1258 } else {
1259 assert(devinfo->gen == 7);
1260 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1261 }
1262 }
1263
1264
1265 void brw_NOP(struct brw_codegen *p)
1266 {
1267 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1268 memset(insn, 0, sizeof(*insn));
1269 brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1270 }
1271
1272
1273
1274
1275
1276 /***********************************************************************
1277 * Comparisons, if/else/endif
1278 */
1279
1280 brw_inst *
1281 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1282 unsigned predicate_control)
1283 {
1284 const struct gen_device_info *devinfo = p->devinfo;
1285 struct brw_reg ip = brw_ip_reg();
1286 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1287
1288 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1289 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1290 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1291 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1292
1293 return inst;
1294 }
1295
1296 static void
1297 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1298 {
1299 p->if_stack[p->if_stack_depth] = inst - p->store;
1300
1301 p->if_stack_depth++;
1302 if (p->if_stack_array_size <= p->if_stack_depth) {
1303 p->if_stack_array_size *= 2;
1304 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1305 p->if_stack_array_size);
1306 }
1307 }
1308
1309 static brw_inst *
1310 pop_if_stack(struct brw_codegen *p)
1311 {
1312 p->if_stack_depth--;
1313 return &p->store[p->if_stack[p->if_stack_depth]];
1314 }
1315
1316 static void
1317 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1318 {
1319 if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1320 p->loop_stack_array_size *= 2;
1321 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1322 p->loop_stack_array_size);
1323 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1324 p->loop_stack_array_size);
1325 }
1326
1327 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1328 p->loop_stack_depth++;
1329 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1330 }
1331
1332 static brw_inst *
1333 get_inner_do_insn(struct brw_codegen *p)
1334 {
1335 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1336 }
1337
1338 /* EU takes the value from the flag register and pushes it onto some
1339 * sort of a stack (presumably merging with any flag value already on
1340 * the stack). Within an if block, the flags at the top of the stack
1341 * control execution on each channel of the unit, e.g. on each of the
1342 * 16 pixel values in our wm programs.
1343 *
1344 * When the matching 'else' instruction is reached (presumably by
1345 * countdown of the instruction count patched in by our ELSE/ENDIF
1346 * functions), the relevant flags are inverted.
1347 *
1348 * When the matching 'endif' instruction is reached, the flags are
1349 * popped off. If the stack is now empty, normal execution resumes.
1350 */
1351 brw_inst *
1352 brw_IF(struct brw_codegen *p, unsigned execute_size)
1353 {
1354 const struct gen_device_info *devinfo = p->devinfo;
1355 brw_inst *insn;
1356
1357 insn = next_insn(p, BRW_OPCODE_IF);
1358
1359 /* Override the defaults for this instruction:
1360 */
1361 if (devinfo->gen < 6) {
1362 brw_set_dest(p, insn, brw_ip_reg());
1363 brw_set_src0(p, insn, brw_ip_reg());
1364 brw_set_src1(p, insn, brw_imm_d(0x0));
1365 } else if (devinfo->gen == 6) {
1366 brw_set_dest(p, insn, brw_imm_w(0));
1367 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1368 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1369 brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1370 } else if (devinfo->gen == 7) {
1371 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1372 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1373 brw_set_src1(p, insn, brw_imm_w(0));
1374 brw_inst_set_jip(devinfo, insn, 0);
1375 brw_inst_set_uip(devinfo, insn, 0);
1376 } else {
1377 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1378 if (devinfo->gen < 12)
1379 brw_set_src0(p, insn, brw_imm_d(0));
1380 brw_inst_set_jip(devinfo, insn, 0);
1381 brw_inst_set_uip(devinfo, insn, 0);
1382 }
1383
1384 brw_inst_set_exec_size(devinfo, insn, execute_size);
1385 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1386 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
1387 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1388 if (!p->single_program_flow && devinfo->gen < 6)
1389 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1390
1391 push_if_stack(p, insn);
1392 p->if_depth_in_loop[p->loop_stack_depth]++;
1393 return insn;
1394 }
1395
1396 /* This function is only used for gen6-style IF instructions with an
1397 * embedded comparison (conditional modifier). It is not used on gen7.
1398 */
1399 brw_inst *
1400 gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
1401 struct brw_reg src0, struct brw_reg src1)
1402 {
1403 const struct gen_device_info *devinfo = p->devinfo;
1404 brw_inst *insn;
1405
1406 insn = next_insn(p, BRW_OPCODE_IF);
1407
1408 brw_set_dest(p, insn, brw_imm_w(0));
1409 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1410 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1411 brw_set_src0(p, insn, src0);
1412 brw_set_src1(p, insn, src1);
1413
1414 assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
1415 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
1416 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1417
1418 push_if_stack(p, insn);
1419 return insn;
1420 }
1421
1422 /**
1423 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1424 */
1425 static void
1426 convert_IF_ELSE_to_ADD(struct brw_codegen *p,
1427 brw_inst *if_inst, brw_inst *else_inst)
1428 {
1429 const struct gen_device_info *devinfo = p->devinfo;
1430
1431 /* The next instruction (where the ENDIF would be, if it existed) */
1432 brw_inst *next_inst = &p->store[p->nr_insn];
1433
1434 assert(p->single_program_flow);
1435 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1436 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1437 assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);
1438
1439 /* Convert IF to an ADD instruction that moves the instruction pointer
1440 * to the first instruction of the ELSE block. If there is no ELSE
1441 * block, point to where ENDIF would be. Reverse the predicate.
1442 *
1443 * There's no need to execute an ENDIF since we don't need to do any
1444 * stack operations, and if we're currently executing, we just want to
1445 * continue normally.
1446 */
1447 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
1448 brw_inst_set_pred_inv(devinfo, if_inst, true);
1449
1450 if (else_inst != NULL) {
1451 /* Convert ELSE to an ADD instruction that points where the ENDIF
1452 * would be.
1453 */
1454 brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);
1455
1456 brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
1457 brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
1458 } else {
1459 brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
1460 }
1461 }
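
/* Worked example of the IP arithmetic above (the indices are made up):
 * with if_inst at store index 10 and else_inst at index 14, the IF becomes
 * a predicated ADD of (14 - 10 + 1) * 16 == 80 bytes, i.e. five 16-byte
 * instructions, landing IP on the first instruction after the ELSE.
 */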
1462
1463 /**
1464 * Patch IF and ELSE instructions with appropriate jump targets.
1465 */
1466 static void
1467 patch_IF_ELSE(struct brw_codegen *p,
1468 brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
1469 {
1470 const struct gen_device_info *devinfo = p->devinfo;
1471
1472 /* We shouldn't be patching IF and ELSE instructions in single program flow
1473 * mode when gen < 6, because in single program flow mode on those
1474 * platforms, we convert flow control instructions to conditional ADDs that
1475 * operate on IP (see brw_ENDIF).
1476 *
1477 * However, on Gen6, writing to IP doesn't work in single program flow mode
1478 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1479 * not be updated by non-flow control instructions."). And on later
1480 * platforms, there is no significant benefit to converting control flow
1481 * instructions to conditional ADDs. So we do patch IF and ELSE
1482 * instructions in single program flow mode on those platforms.
1483 */
1484 if (devinfo->gen < 6)
1485 assert(!p->single_program_flow);
1486
1487 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1488 assert(endif_inst != NULL);
1489 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1490
1491 unsigned br = brw_jump_scale(devinfo);
1492
1493 assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
1494 brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
1495
1496 if (else_inst == NULL) {
1497 /* Patch IF -> ENDIF */
1498 if (devinfo->gen < 6) {
1499 /* Turn it into an IFF, which means no mask stack operations for
1500 * all-false and jumping past the ENDIF.
1501 */
1502 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
1503 brw_inst_set_gen4_jump_count(devinfo, if_inst,
1504 br * (endif_inst - if_inst + 1));
1505 brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1506 } else if (devinfo->gen == 6) {
1507 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1508 brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1509 } else {
1510 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1511 brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1512 }
1513 } else {
1514 brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
1515
1516 /* Patch IF -> ELSE */
1517 if (devinfo->gen < 6) {
1518 brw_inst_set_gen4_jump_count(devinfo, if_inst,
1519 br * (else_inst - if_inst));
1520 brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1521 } else if (devinfo->gen == 6) {
1522 brw_inst_set_gen6_jump_count(devinfo, if_inst,
1523 br * (else_inst - if_inst + 1));
1524 }
1525
1526 /* Patch ELSE -> ENDIF */
1527 if (devinfo->gen < 6) {
1528 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1529 * matching ENDIF.
1530 */
1531 brw_inst_set_gen4_jump_count(devinfo, else_inst,
1532 br * (endif_inst - else_inst + 1));
1533 brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
1534 } else if (devinfo->gen == 6) {
1535 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1536 brw_inst_set_gen6_jump_count(devinfo, else_inst,
1537 br * (endif_inst - else_inst));
1538 } else {
1539 /* The IF instruction's JIP should point just past the ELSE */
1540 brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1541 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1542 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1543 brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1544 if (devinfo->gen >= 8) {
1545 /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1546 * should point to ENDIF.
1547 */
1548 brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1549 }
1550 }
1551 }
1552 }
1553
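/**
 * Emit an ELSE instruction with zeroed jump fields and push it on the
 * if-stack.  The real jump targets (jump counts on gen4-6, JIP/UIP on
 * gen7+) are filled in by patch_IF_ELSE() once the matching ENDIF is
 * emitted.
 */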
1554 void
1555 brw_ELSE(struct brw_codegen *p)
1556 {
1557 const struct gen_device_info *devinfo = p->devinfo;
1558 brw_inst *insn;
1559
1560 insn = next_insn(p, BRW_OPCODE_ELSE);
1561
1562 if (devinfo->gen < 6) {
1563 brw_set_dest(p, insn, brw_ip_reg());
1564 brw_set_src0(p, insn, brw_ip_reg());
1565 brw_set_src1(p, insn, brw_imm_d(0x0));
1566 } else if (devinfo->gen == 6) {
1567 brw_set_dest(p, insn, brw_imm_w(0));
1568 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1569 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1570 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1571 } else if (devinfo->gen == 7) {
1572 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1573 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1574 brw_set_src1(p, insn, brw_imm_w(0));
1575 brw_inst_set_jip(devinfo, insn, 0);
1576 brw_inst_set_uip(devinfo, insn, 0);
1577 } else {
1578 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1579 if (devinfo->gen < 12)
1580 brw_set_src0(p, insn, brw_imm_d(0));
1581 brw_inst_set_jip(devinfo, insn, 0);
1582 brw_inst_set_uip(devinfo, insn, 0);
1583 }
1584
1585 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1586 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1587 if (!p->single_program_flow && devinfo->gen < 6)
1588 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1589
1590 push_if_stack(p, insn);
1591 }
1592
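/**
 * Emit an ENDIF instruction, popping the matching IF (and optional
 * ELSE) off the if-stack and patching their jump fields.  In single
 * program flow mode on gen4/5 no ENDIF is emitted; the whole construct
 * is lowered to conditional ADDs on IP instead.
 */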
1593 void
1594 brw_ENDIF(struct brw_codegen *p)
1595 {
1596 const struct gen_device_info *devinfo = p->devinfo;
1597 brw_inst *insn = NULL;
1598 brw_inst *else_inst = NULL;
1599 brw_inst *if_inst = NULL;
1600 brw_inst *tmp;
1601 bool emit_endif = true;
1602
1603 /* In single program flow mode, we can express IF and ELSE instructions
1604 * equivalently as ADD instructions that operate on IP. On platforms prior
1605 * to Gen6, flow control instructions cause an implied thread switch, so
1606 * this is a significant savings.
1607 *
1608 * However, on Gen6, writing to IP doesn't work in single program flow mode
1609 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1610 * not be updated by non-flow control instructions."). And on later
1611 * platforms, there is no significant benefit to converting control flow
1612 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1613 * Gen5.
1614 */
1615 if (devinfo->gen < 6 && p->single_program_flow)
1616 emit_endif = false;
1617
1618    /*
1619     * A single next_insn() may change the base address of the instruction
1620     * store memory (p->store), so call it first, before computing any
1621     * instruction pointer from an index into the store.
1622     */
1623 if (emit_endif)
1624 insn = next_insn(p, BRW_OPCODE_ENDIF);
1625
1626 /* Pop the IF and (optional) ELSE instructions from the stack */
1627 p->if_depth_in_loop[p->loop_stack_depth]--;
1628 tmp = pop_if_stack(p);
1629 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1630 else_inst = tmp;
1631 tmp = pop_if_stack(p);
1632 }
1633 if_inst = tmp;
1634
1635 if (!emit_endif) {
1636 /* ENDIF is useless; don't bother emitting it. */
1637 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1638 return;
1639 }
1640
1641 if (devinfo->gen < 6) {
1642 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1643 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1644 brw_set_src1(p, insn, brw_imm_d(0x0));
1645 } else if (devinfo->gen == 6) {
1646 brw_set_dest(p, insn, brw_imm_w(0));
1647 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1648 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1649 } else if (devinfo->gen == 7) {
1650 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1651 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1652 brw_set_src1(p, insn, brw_imm_w(0));
1653 } else {
1654 brw_set_src0(p, insn, brw_imm_d(0));
1655 }
1656
1657 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1658 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1659 if (devinfo->gen < 6)
1660 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1661
1662 /* Also pop item off the stack in the endif instruction: */
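   /* The jump distances written here are only defaults; on gen6+ they are
    * recomputed by brw_set_uip_jip() once the location of the next block
    * end is known.
    */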
1663 if (devinfo->gen < 6) {
1664 brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1665 brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1666 } else if (devinfo->gen == 6) {
1667 brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1668 } else {
1669 brw_inst_set_jip(devinfo, insn, 2);
1670 }
1671 patch_IF_ELSE(p, if_inst, else_inst, insn);
1672 }
1673
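/**
 * Emit a BREAK.  On gen6+ the JIP/UIP fields are filled in after the
 * program is generated by brw_set_uip_jip(); pre-gen6 the jump count is
 * patched by brw_patch_break_cont() and the pop count unwinds the IFs
 * nested inside the current loop.
 */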
1674 brw_inst *
1675 brw_BREAK(struct brw_codegen *p)
1676 {
1677 const struct gen_device_info *devinfo = p->devinfo;
1678 brw_inst *insn;
1679
1680 insn = next_insn(p, BRW_OPCODE_BREAK);
1681 if (devinfo->gen >= 8) {
1682 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1683 brw_set_src0(p, insn, brw_imm_d(0x0));
1684 } else if (devinfo->gen >= 6) {
1685 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1686 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1687 brw_set_src1(p, insn, brw_imm_d(0x0));
1688 } else {
1689 brw_set_dest(p, insn, brw_ip_reg());
1690 brw_set_src0(p, insn, brw_ip_reg());
1691 brw_set_src1(p, insn, brw_imm_d(0x0));
1692 brw_inst_set_gen4_pop_count(devinfo, insn,
1693 p->if_depth_in_loop[p->loop_stack_depth]);
1694 }
1695 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1696 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1697
1698 return insn;
1699 }
1700
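/**
 * Emit a CONTINUE.  Its jump targets are patched later, the same way as
 * for BREAK above.
 */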
1701 brw_inst *
1702 brw_CONT(struct brw_codegen *p)
1703 {
1704 const struct gen_device_info *devinfo = p->devinfo;
1705 brw_inst *insn;
1706
1707 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1708 brw_set_dest(p, insn, brw_ip_reg());
1709 if (devinfo->gen >= 8) {
1710 brw_set_src0(p, insn, brw_imm_d(0x0));
1711 } else {
1712 brw_set_src0(p, insn, brw_ip_reg());
1713 brw_set_src1(p, insn, brw_imm_d(0x0));
1714 }
1715
1716 if (devinfo->gen < 6) {
1717 brw_inst_set_gen4_pop_count(devinfo, insn,
1718 p->if_depth_in_loop[p->loop_stack_depth]);
1719 }
1720 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1721 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1722 return insn;
1723 }
1724
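/**
 * Emit a HALT with zeroed jump fields.  The caller is expected to set
 * UIP, and JIP is computed afterwards by brw_set_uip_jip().
 */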
1725 brw_inst *
1726 gen6_HALT(struct brw_codegen *p)
1727 {
1728 const struct gen_device_info *devinfo = p->devinfo;
1729 brw_inst *insn;
1730
1731 insn = next_insn(p, BRW_OPCODE_HALT);
1732 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1733 if (devinfo->gen < 8) {
1734 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1735 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1736 } else if (devinfo->gen < 12) {
1737 brw_set_src0(p, insn, brw_imm_d(0x0));
1738 }
1739
1740 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1741 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1742 return insn;
1743 }
1744
1745 /* DO/WHILE loop:
1746 *
1747 * The DO/WHILE is just an unterminated loop -- break or continue are
1748  * used for control within the loop.  There are a few ways it can be
1749  * done.
1750 *
1751 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1752 * jip and no DO instruction.
1753 *
1754 * For non-uniform control flow pre-gen6, there's a DO instruction to
1755 * push the mask, and a WHILE to jump back, and BREAK to get out and
1756 * pop the mask.
1757 *
1758 * For gen6, there's no more mask stack, so no need for DO. WHILE
1759 * just points back to the first instruction of the loop.
1760 */
1761 brw_inst *
1762 brw_DO(struct brw_codegen *p, unsigned execute_size)
1763 {
1764 const struct gen_device_info *devinfo = p->devinfo;
1765
1766 if (devinfo->gen >= 6 || p->single_program_flow) {
1767 push_loop_stack(p, &p->store[p->nr_insn]);
1768 return &p->store[p->nr_insn];
1769 } else {
1770 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1771
1772 push_loop_stack(p, insn);
1773
1774 /* Override the defaults for this instruction:
1775 */
1776 brw_set_dest(p, insn, brw_null_reg());
1777 brw_set_src0(p, insn, brw_null_reg());
1778 brw_set_src1(p, insn, brw_null_reg());
1779
1780 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1781 brw_inst_set_exec_size(devinfo, insn, execute_size);
1782 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1783
1784 return insn;
1785 }
1786 }
1787
1788 /**
1789 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1790 * instruction here.
1791 *
1792 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1793 * nesting, since it can always just point to the end of the block/current loop.
1794 */
1795 static void
1796 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1797 {
1798 const struct gen_device_info *devinfo = p->devinfo;
1799 brw_inst *do_inst = get_inner_do_insn(p);
1800 brw_inst *inst;
1801 unsigned br = brw_jump_scale(devinfo);
1802
1803 assert(devinfo->gen < 6);
1804
1805 for (inst = while_inst - 1; inst != do_inst; inst--) {
1806 /* If the jump count is != 0, that means that this instruction has already
1807 * been patched because it's part of a loop inside of the one we're
1808 * patching.
1809 */
1810 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1811 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1812 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1813 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1814 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1815 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1816 }
1817 }
1818 }
1819
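/**
 * Emit the WHILE closing the innermost DO block.  On gen6+ the WHILE
 * jumps back to the first instruction of the loop (there is no DO);
 * pre-gen6 it jumps to the instruction just after the DO, and any
 * BREAK/CONT instructions inside the loop body are patched here via
 * brw_patch_break_cont().
 */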
1820 brw_inst *
1821 brw_WHILE(struct brw_codegen *p)
1822 {
1823 const struct gen_device_info *devinfo = p->devinfo;
1824 brw_inst *insn, *do_insn;
1825 unsigned br = brw_jump_scale(devinfo);
1826
1827 if (devinfo->gen >= 6) {
1828 insn = next_insn(p, BRW_OPCODE_WHILE);
1829 do_insn = get_inner_do_insn(p);
1830
1831 if (devinfo->gen >= 8) {
1832 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1833 if (devinfo->gen < 12)
1834 brw_set_src0(p, insn, brw_imm_d(0));
1835 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1836 } else if (devinfo->gen == 7) {
1837 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1838 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1839 brw_set_src1(p, insn, brw_imm_w(0));
1840 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1841 } else {
1842 brw_set_dest(p, insn, brw_imm_w(0));
1843 brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1844 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1845 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1846 }
1847
1848 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1849
1850 } else {
1851 if (p->single_program_flow) {
1852 insn = next_insn(p, BRW_OPCODE_ADD);
1853 do_insn = get_inner_do_insn(p);
1854
1855 brw_set_dest(p, insn, brw_ip_reg());
1856 brw_set_src0(p, insn, brw_ip_reg());
1857 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1858 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1859 } else {
1860 insn = next_insn(p, BRW_OPCODE_WHILE);
1861 do_insn = get_inner_do_insn(p);
1862
1863 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1864
1865 brw_set_dest(p, insn, brw_ip_reg());
1866 brw_set_src0(p, insn, brw_ip_reg());
1867 brw_set_src1(p, insn, brw_imm_d(0));
1868
1869 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1870 brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1871 brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1872
1873 brw_patch_break_cont(p, insn);
1874 }
1875 }
1876 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1877
1878 p->loop_stack_depth--;
1879
1880 return insn;
1881 }
1882
1883 /* FORWARD JUMPS:
1884 */
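/* Patch the JMPI previously emitted at jmp_insn_idx so that it lands on
 * the current end of the program.  Jump counts are in units of whole
 * instructions on gen4 and 64-bit chunks on gen5+.
 */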
1885 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1886 {
1887 const struct gen_device_info *devinfo = p->devinfo;
1888 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1889 unsigned jmpi = 1;
1890
1891 if (devinfo->gen >= 5)
1892 jmpi = 2;
1893
1894 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1895 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1896
1897 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1898 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1899 }
1900
1901 /* To integrate with the above, it makes sense that the comparison
1902 * instruction should populate the flag register. It might be simpler
1903 * just to use the flag reg for most WM tasks?
1904 */
1905 void brw_CMP(struct brw_codegen *p,
1906 struct brw_reg dest,
1907 unsigned conditional,
1908 struct brw_reg src0,
1909 struct brw_reg src1)
1910 {
1911 const struct gen_device_info *devinfo = p->devinfo;
1912 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1913
1914 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1915 brw_set_dest(p, insn, dest);
1916 brw_set_src0(p, insn, src0);
1917 brw_set_src1(p, insn, src1);
1918
1919 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1920 * page says:
1921 * "Any CMP instruction with a null destination must use a {switch}."
1922 *
1923 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1924 * mentioned on their work-arounds pages.
1925 */
1926 if (devinfo->gen == 7) {
1927 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1928 dest.nr == BRW_ARF_NULL) {
1929 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1930 }
1931 }
1932 }
1933
1934 /***********************************************************************
1935 * Helpers for the various SEND message types:
1936 */
1937
1938 /** Extended math function, float[8].
1939 */
1940 void gen4_math(struct brw_codegen *p,
1941 struct brw_reg dest,
1942 unsigned function,
1943 unsigned msg_reg_nr,
1944 struct brw_reg src,
1945 unsigned precision )
1946 {
1947 const struct gen_device_info *devinfo = p->devinfo;
1948 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1949 unsigned data_type;
1950 if (has_scalar_region(src)) {
1951 data_type = BRW_MATH_DATA_SCALAR;
1952 } else {
1953 data_type = BRW_MATH_DATA_VECTOR;
1954 }
1955
1956 assert(devinfo->gen < 6);
1957
1958 /* Example code doesn't set predicate_control for send
1959 * instructions.
1960 */
1961 brw_inst_set_pred_control(devinfo, insn, 0);
1962 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1963
1964 brw_set_dest(p, insn, dest);
1965 brw_set_src0(p, insn, src);
1966 brw_set_math_message(p,
1967 insn,
1968 function,
1969 src.type == BRW_REGISTER_TYPE_D,
1970 precision,
1971 data_type);
1972 }
1973
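/** Extended math function on gen6+.  Unlike gen4/5, this is a native
 * MATH instruction rather than a SEND to the shared math unit.
 */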
1974 void gen6_math(struct brw_codegen *p,
1975 struct brw_reg dest,
1976 unsigned function,
1977 struct brw_reg src0,
1978 struct brw_reg src1)
1979 {
1980 const struct gen_device_info *devinfo = p->devinfo;
1981 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1982
1983 assert(devinfo->gen >= 6);
1984
1985 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1986 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1987
1988 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1989 if (devinfo->gen == 6) {
1990 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1991 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1992 }
1993
1994 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1995 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1996 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1997 assert(src0.type != BRW_REGISTER_TYPE_F);
1998 assert(src1.type != BRW_REGISTER_TYPE_F);
1999 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2000 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2001 } else {
2002 assert(src0.type == BRW_REGISTER_TYPE_F ||
2003 (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
2004 assert(src1.type == BRW_REGISTER_TYPE_F ||
2005 (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
2006 }
2007
2008 /* Source modifiers are ignored for extended math instructions on Gen6. */
2009 if (devinfo->gen == 6) {
2010 assert(!src0.negate);
2011 assert(!src0.abs);
2012 assert(!src1.negate);
2013 assert(!src1.abs);
2014 }
2015
2016 brw_inst_set_math_function(devinfo, insn, function);
2017
2018 brw_set_dest(p, insn, dest);
2019 brw_set_src0(p, insn, src0);
2020 brw_set_src1(p, insn, src1);
2021 }
2022
2023 /**
2024 * Return the right surface index to access the thread scratch space using
2025 * stateless dataport messages.
2026 */
2027 unsigned
2028 brw_scratch_surface_idx(const struct brw_codegen *p)
2029 {
2030 /* The scratch space is thread-local so IA coherency is unnecessary. */
2031 if (p->devinfo->gen >= 8)
2032 return GEN8_BTI_STATELESS_NON_COHERENT;
2033 else
2034 return BRW_BTI_STATELESS;
2035 }
2036
2037 /**
2038  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
2039 * using a constant offset per channel.
2040 *
2041 * The offset must be aligned to oword size (16 bytes). Used for
2042 * register spilling.
2043 */
2044 void brw_oword_block_write_scratch(struct brw_codegen *p,
2045 struct brw_reg mrf,
2046 int num_regs,
2047 unsigned offset)
2048 {
2049 const struct gen_device_info *devinfo = p->devinfo;
2050 const unsigned target_cache =
2051 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2052 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2053 BRW_SFID_DATAPORT_WRITE);
2054 uint32_t msg_type;
2055
2056 if (devinfo->gen >= 6)
2057 offset /= 16;
2058
2059 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2060
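   /* One message register of header plus the registers being spilled. */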
2061 const unsigned mlen = 1 + num_regs;
2062
2063 /* Set up the message header. This is g0, with g0.2 filled with
2064 * the offset. We don't want to leave our offset around in g0 or
2065 * it'll screw up texture samples, so set it up inside the message
2066 * reg.
2067 */
2068 {
2069 brw_push_insn_state(p);
2070 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2071 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2072 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2073
2074 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2075
2076 /* set message header global offset field (reg 0, element 2) */
2077 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2078 brw_MOV(p,
2079 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2080 mrf.nr,
2081 2), BRW_REGISTER_TYPE_UD),
2082 brw_imm_ud(offset));
2083
2084 brw_pop_insn_state(p);
2085 }
2086
2087 {
2088 struct brw_reg dest;
2089 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2090 int send_commit_msg;
2091 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2092 BRW_REGISTER_TYPE_UW);
2093
2094 brw_inst_set_sfid(devinfo, insn, target_cache);
2095 brw_inst_set_compression(devinfo, insn, false);
2096
2097 if (brw_inst_exec_size(devinfo, insn) >= 16)
2098 src_header = vec16(src_header);
2099
2100 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2101 if (devinfo->gen < 6)
2102 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2103
2104 /* Until gen6, writes followed by reads from the same location
2105 * are not guaranteed to be ordered unless write_commit is set.
2106 * If set, then a no-op write is issued to the destination
2107 * register to set a dependency, and a read from the destination
2108 * can be used to ensure the ordering.
2109 *
2110 * For gen6, only writes between different threads need ordering
2111 * protection. Our use of DP writes is all about register
2112 * spilling within a thread.
2113 */
2114 if (devinfo->gen >= 6) {
2115 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2116 send_commit_msg = 0;
2117 } else {
2118 dest = src_header;
2119 send_commit_msg = 1;
2120 }
2121
2122 brw_set_dest(p, insn, dest);
2123 if (devinfo->gen >= 6) {
2124 brw_set_src0(p, insn, mrf);
2125 } else {
2126 brw_set_src0(p, insn, brw_null_reg());
2127 }
2128
2129 if (devinfo->gen >= 6)
2130 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2131 else
2132 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2133
2134 brw_set_desc(p, insn,
2135 brw_message_desc(devinfo, mlen, send_commit_msg, true) |
2136 brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
2137 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2138 msg_type, 0, /* not a render target */
2139 send_commit_msg));
2140 }
2141 }
2142
2143
2144 /**
2145  * Read a block of OWORDs (half a GRF each) from the scratch buffer
2146 * using a constant index per channel.
2147 *
2148 * Offset must be aligned to oword size (16 bytes). Used for register
2149 * spilling.
2150 */
2151 void
2152 brw_oword_block_read_scratch(struct brw_codegen *p,
2153 struct brw_reg dest,
2154 struct brw_reg mrf,
2155 int num_regs,
2156 unsigned offset)
2157 {
2158 const struct gen_device_info *devinfo = p->devinfo;
2159
2160 if (devinfo->gen >= 6)
2161 offset /= 16;
2162
2163 if (p->devinfo->gen >= 7) {
2164 /* On gen 7 and above, we no longer have message registers and we can
2165 * send from any register we want. By using the destination register
2166 * for the message, we guarantee that the implied message write won't
2167 * accidentally overwrite anything. This has been a problem because
2168 * the MRF registers and source for the final FB write are both fixed
2169 * and may overlap.
2170 */
2171 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2172 } else {
2173 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2174 }
2175 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2176
2177 const unsigned rlen = num_regs;
2178 const unsigned target_cache =
2179 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2180 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2181 BRW_SFID_DATAPORT_READ);
2182
2183 {
2184 brw_push_insn_state(p);
2185 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2186 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2187 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2188
2189 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2190
2191 /* set message header global offset field (reg 0, element 2) */
2192 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2193 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2194
2195 brw_pop_insn_state(p);
2196 }
2197
2198 {
2199 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2200
2201 brw_inst_set_sfid(devinfo, insn, target_cache);
2202 assert(brw_inst_pred_control(devinfo, insn) == 0);
2203 brw_inst_set_compression(devinfo, insn, false);
2204
2205 brw_set_dest(p, insn, dest); /* UW? */
2206 if (devinfo->gen >= 6) {
2207 brw_set_src0(p, insn, mrf);
2208 } else {
2209 brw_set_src0(p, insn, brw_null_reg());
2210 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2211 }
2212
2213 brw_set_desc(p, insn,
2214 brw_message_desc(devinfo, 1, rlen, true) |
2215 brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
2216 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2217 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2218 BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2219 }
2220 }
2221
2222 void
2223 gen7_block_read_scratch(struct brw_codegen *p,
2224 struct brw_reg dest,
2225 int num_regs,
2226 unsigned offset)
2227 {
2228 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2229 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2230
2231 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2232
2233 /* The HW requires that the header is present; this is to get the g0.5
2234 * scratch offset.
2235 */
2236 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2237
2238 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2239 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2240 * is 32 bytes, which happens to be the size of a register.
2241 */
2242 offset /= REG_SIZE;
2243 assert(offset < (1 << 12));
2244
2245 gen7_set_dp_scratch_message(p, insn,
2246 false, /* scratch read */
2247 false, /* OWords */
2248 false, /* invalidate after read */
2249 num_regs,
2250 offset,
2251 1, /* mlen: just g0 */
2252 num_regs, /* rlen */
2253 true); /* header present */
2254 }
2255
2256 /**
2257 * Read float[4] vectors from the data port constant cache.
2258 * Location (in buffer) should be a multiple of 16.
2259 * Used for fetching shader constants.
2260 */
2261 void brw_oword_block_read(struct brw_codegen *p,
2262 struct brw_reg dest,
2263 struct brw_reg mrf,
2264 uint32_t offset,
2265 uint32_t bind_table_index)
2266 {
2267 const struct gen_device_info *devinfo = p->devinfo;
2268 const unsigned target_cache =
2269 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2270 BRW_SFID_DATAPORT_READ);
2271 const unsigned exec_size = 1 << brw_get_default_exec_size(p);
2272
2273 /* On newer hardware, offset is in units of owords. */
2274 if (devinfo->gen >= 6)
2275 offset /= 16;
2276
2277 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2278
2279 brw_push_insn_state(p);
2280 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2281 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2282 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2283
2284 brw_push_insn_state(p);
2285 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2286 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2287
2288 /* set message header global offset field (reg 0, element 2) */
2289 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2290 brw_MOV(p,
2291 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2292 mrf.nr,
2293 2), BRW_REGISTER_TYPE_UD),
2294 brw_imm_ud(offset));
2295 brw_pop_insn_state(p);
2296
2297 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2298
2299 brw_inst_set_sfid(devinfo, insn, target_cache);
2300
2301 /* cast dest to a uword[8] vector */
2302 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2303
2304 brw_set_dest(p, insn, dest);
2305 if (devinfo->gen >= 6) {
2306 brw_set_src0(p, insn, mrf);
2307 } else {
2308 brw_set_src0(p, insn, brw_null_reg());
2309 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2310 }
2311
2312 brw_set_desc(p, insn,
2313 brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
2314 brw_dp_read_desc(devinfo, bind_table_index,
2315 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2316 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2317 BRW_DATAPORT_READ_TARGET_DATA_CACHE));
2318
2319 brw_pop_insn_state(p);
2320 }
2321
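/**
 * Emit a render target write message.  On gen6+ this uses SENDC, which
 * stalls until dependent threads (e.g. those shading the same pixels)
 * have cleared, preserving render ordering.
 */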
2322 brw_inst *
2323 brw_fb_WRITE(struct brw_codegen *p,
2324 struct brw_reg payload,
2325 struct brw_reg implied_header,
2326 unsigned msg_control,
2327 unsigned binding_table_index,
2328 unsigned msg_length,
2329 unsigned response_length,
2330 bool eot,
2331 bool last_render_target,
2332 bool header_present)
2333 {
2334 const struct gen_device_info *devinfo = p->devinfo;
2335 const unsigned target_cache =
2336 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2337 BRW_SFID_DATAPORT_WRITE);
2338 brw_inst *insn;
2339 unsigned msg_type;
2340 struct brw_reg dest, src0;
2341
2342 if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
2343 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2344 else
2345 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2346
2347 if (devinfo->gen >= 6) {
2348 insn = next_insn(p, BRW_OPCODE_SENDC);
2349 } else {
2350 insn = next_insn(p, BRW_OPCODE_SEND);
2351 }
2352 brw_inst_set_sfid(devinfo, insn, target_cache);
2353 brw_inst_set_compression(devinfo, insn, false);
2354
2355 if (devinfo->gen >= 6) {
2356 /* headerless version, just submit color payload */
2357 src0 = payload;
2358
2359 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2360 } else {
2361 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2362 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2363 src0 = implied_header;
2364
2365 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2366 }
2367
2368 brw_set_dest(p, insn, dest);
2369 brw_set_src0(p, insn, src0);
2370 brw_set_desc(p, insn,
2371 brw_message_desc(devinfo, msg_length, response_length,
2372 header_present) |
2373 brw_dp_write_desc(devinfo, binding_table_index, msg_control,
2374 msg_type, last_render_target,
2375 0 /* send_commit_msg */));
2376 brw_inst_set_eot(devinfo, insn, eot);
2377
2378 return insn;
2379 }
2380
2381 brw_inst *
2382 gen9_fb_READ(struct brw_codegen *p,
2383 struct brw_reg dst,
2384 struct brw_reg payload,
2385 unsigned binding_table_index,
2386 unsigned msg_length,
2387 unsigned response_length,
2388 bool per_sample)
2389 {
2390 const struct gen_device_info *devinfo = p->devinfo;
2391 assert(devinfo->gen >= 9);
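   /* The message subtype selects the SIMD width of the render target
    * read: 0 for SIMD16, 1 for SIMD8.
    */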
2392 const unsigned msg_subtype =
2393 brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
2394 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2395
2396 brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
2397 brw_set_dest(p, insn, dst);
2398 brw_set_src0(p, insn, payload);
2399 brw_set_desc(
2400 p, insn,
2401 brw_message_desc(devinfo, msg_length, response_length, true) |
2402 brw_dp_read_desc(devinfo, binding_table_index,
2403 per_sample << 5 | msg_subtype,
2404 GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2405 BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2406 brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
2407
2408 return insn;
2409 }
2410
2411 /**
2412 * Texture sample instruction.
2413 * Note: the msg_type plus msg_length values determine exactly what kind
2414 * of sampling operation is performed. See volume 4, page 161 of docs.
2415 */
2416 void brw_SAMPLE(struct brw_codegen *p,
2417 struct brw_reg dest,
2418 unsigned msg_reg_nr,
2419 struct brw_reg src0,
2420 unsigned binding_table_index,
2421 unsigned sampler,
2422 unsigned msg_type,
2423 unsigned response_length,
2424 unsigned msg_length,
2425 unsigned header_present,
2426 unsigned simd_mode,
2427 unsigned return_format)
2428 {
2429 const struct gen_device_info *devinfo = p->devinfo;
2430 brw_inst *insn;
2431
2432 if (msg_reg_nr != -1)
2433 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2434
2435 insn = next_insn(p, BRW_OPCODE_SEND);
2436 brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
2437 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2438
2439 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2440 *
2441 * "Instruction compression is not allowed for this instruction (that
2442 * is, send). The hardware behavior is undefined if this instruction is
2443 * set as compressed. However, compress control can be set to "SecHalf"
2444 * to affect the EMask generation."
2445 *
2446 * No similar wording is found in later PRMs, but there are examples
2447 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2448 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2449 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2450 */
2451 brw_inst_set_compression(devinfo, insn, false);
2452
2453 if (devinfo->gen < 6)
2454 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2455
2456 brw_set_dest(p, insn, dest);
2457 brw_set_src0(p, insn, src0);
2458 brw_set_desc(p, insn,
2459 brw_message_desc(devinfo, msg_length, response_length,
2460 header_present) |
2461 brw_sampler_desc(devinfo, binding_table_index, sampler,
2462 msg_type, simd_mode, return_format));
2463 }
2464
2465 /* Adjust the message header's sampler state pointer to
2466 * select the correct group of 16 samplers.
2467 */
2468 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2469 struct brw_reg header,
2470 struct brw_reg sampler_index)
2471 {
2472 /* The "Sampler Index" field can only store values between 0 and 15.
2473 * However, we can add an offset to the "Sampler State Pointer"
2474 * field, effectively selecting a different set of 16 samplers.
2475 *
2476 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2477 * offset, and each sampler state is only 16-bytes, so we can't
2478 * exclusively use the offset - we have to use both.
2479 */
2480
2481 const struct gen_device_info *devinfo = p->devinfo;
2482
2483 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2484 const int sampler_state_size = 16; /* 16 bytes */
2485 uint32_t sampler = sampler_index.ud;
2486
2487 if (sampler >= 16) {
2488 assert(devinfo->is_haswell || devinfo->gen >= 8);
2489 brw_ADD(p,
2490 get_element_ud(header, 3),
2491 get_element_ud(brw_vec8_grf(0, 0), 3),
2492 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2493 }
2494 } else {
2495 /* Non-const sampler array indexing case */
2496 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2497 return;
2498 }
2499
2500 struct brw_reg temp = get_element_ud(header, 3);
2501
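      /* Compute 16 * (sampler / 16) * sampler_state_size without a
       * multiply: mask off the low nibble of the index and shift left by
       * 4, i.e. (index & 0xf0) << 4.
       */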
2502 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2503 brw_SHL(p, temp, temp, brw_imm_ud(4));
2504 brw_ADD(p,
2505 get_element_ud(header, 3),
2506 get_element_ud(brw_vec8_grf(0, 0), 3),
2507 temp);
2508 }
2509 }
2510
2511 /* All these variables are pretty confusing - we might be better off
2512 * using bitmasks and macros for this, in the old style. Or perhaps
2513 * just having the caller instantiate the fields in dword3 itself.
2514 */
2515 void brw_urb_WRITE(struct brw_codegen *p,
2516 struct brw_reg dest,
2517 unsigned msg_reg_nr,
2518 struct brw_reg src0,
2519 enum brw_urb_write_flags flags,
2520 unsigned msg_length,
2521 unsigned response_length,
2522 unsigned offset,
2523 unsigned swizzle)
2524 {
2525 const struct gen_device_info *devinfo = p->devinfo;
2526 brw_inst *insn;
2527
2528 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2529
2530 if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2531 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2532 brw_push_insn_state(p);
2533 brw_set_default_access_mode(p, BRW_ALIGN_1);
2534 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2535 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2536 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2537 BRW_REGISTER_TYPE_UD),
2538 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2539 brw_imm_ud(0xff00));
2540 brw_pop_insn_state(p);
2541 }
2542
2543 insn = next_insn(p, BRW_OPCODE_SEND);
2544
2545 assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2546
2547 brw_set_dest(p, insn, dest);
2548 brw_set_src0(p, insn, src0);
2549 brw_set_src1(p, insn, brw_imm_d(0));
2550
2551 if (devinfo->gen < 6)
2552 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2553
2554 brw_set_urb_message(p,
2555 insn,
2556 flags,
2557 msg_length,
2558 response_length,
2559 offset,
2560 swizzle);
2561 }
2562
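/**
 * Emit a SEND with a possibly indirect message descriptor.  An
 * immediate descriptor is encoded directly (ORed with desc_imm);
 * otherwise the combined descriptor is first loaded into address
 * register a0.0.
 */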
2563 void
2564 brw_send_indirect_message(struct brw_codegen *p,
2565 unsigned sfid,
2566 struct brw_reg dst,
2567 struct brw_reg payload,
2568 struct brw_reg desc,
2569 unsigned desc_imm,
2570 bool eot)
2571 {
2572 const struct gen_device_info *devinfo = p->devinfo;
2573 struct brw_inst *send;
2574
2575 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2576
2577 assert(desc.type == BRW_REGISTER_TYPE_UD);
2578
2579 if (desc.file == BRW_IMMEDIATE_VALUE) {
2580 send = next_insn(p, BRW_OPCODE_SEND);
2581 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2582 brw_set_desc(p, send, desc.ud | desc_imm);
2583 } else {
2584 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2585
2586 brw_push_insn_state(p);
2587 brw_set_default_access_mode(p, BRW_ALIGN_1);
2588 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2589 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2590 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2591
2592 /* Load the indirect descriptor to an address register using OR so the
2593 * caller can specify additional descriptor bits with the desc_imm
2594 * immediate.
2595 */
2596 brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2597
2598 brw_pop_insn_state(p);
2599
2600 send = next_insn(p, BRW_OPCODE_SEND);
2601 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2602
2603 if (devinfo->gen >= 12)
2604 brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
2605 else
2606 brw_set_src1(p, send, addr);
2607 }
2608
2609 brw_set_dest(p, send, dst);
2610 brw_inst_set_sfid(devinfo, send, sfid);
2611 brw_inst_set_eot(devinfo, send, eot);
2612 }
2613
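/**
 * Emit a split SEND with two payloads and an extended descriptor.
 * Immediate descriptors are encoded directly when the instruction
 * encoding allows; otherwise they are staged through a0.0 (desc) and
 * a0.2 (ex_desc).
 */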
2614 void
2615 brw_send_indirect_split_message(struct brw_codegen *p,
2616 unsigned sfid,
2617 struct brw_reg dst,
2618 struct brw_reg payload0,
2619 struct brw_reg payload1,
2620 struct brw_reg desc,
2621 unsigned desc_imm,
2622 struct brw_reg ex_desc,
2623 unsigned ex_desc_imm,
2624 bool eot)
2625 {
2626 const struct gen_device_info *devinfo = p->devinfo;
2627 struct brw_inst *send;
2628
2629 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2630
2631 assert(desc.type == BRW_REGISTER_TYPE_UD);
2632
2633 if (desc.file == BRW_IMMEDIATE_VALUE) {
2634 desc.ud |= desc_imm;
2635 } else {
2636 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2637
2638 brw_push_insn_state(p);
2639 brw_set_default_access_mode(p, BRW_ALIGN_1);
2640 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2641 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2642 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2643
2644 /* Load the indirect descriptor to an address register using OR so the
2645 * caller can specify additional descriptor bits with the desc_imm
2646 * immediate.
2647 */
2648 brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2649
2650 brw_pop_insn_state(p);
2651 desc = addr;
2652 }
2653
2654 if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
2655 (ex_desc.ud & INTEL_MASK(15, 12)) == 0) {
2656 ex_desc.ud |= ex_desc_imm;
2657 } else {
2658 struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);
2659
2660 brw_push_insn_state(p);
2661 brw_set_default_access_mode(p, BRW_ALIGN_1);
2662 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2663 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2664 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2665
2666 /* Load the indirect extended descriptor to an address register using OR
2667 * so the caller can specify additional descriptor bits with the
2668         * ex_desc_imm immediate.
2669 *
2670 * Even though the instruction dispatcher always pulls the SFID and EOT
2671        * fields from the instruction itself, the actual external unit which
2672        * processes the message gets the SFID and EOT from the extended
2673        * descriptor, which comes from the address register.  If we don't OR
2674 * those two bits in, the external unit may get confused and hang.
2675 */
2676 unsigned imm_part = ex_desc_imm | sfid | eot << 5;
2677
2678 if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
2679 /* ex_desc bits 15:12 don't exist in the instruction encoding, so
2680 * we may have fallen back to an indirect extended descriptor.
2681 */
2682 brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
2683 } else {
2684 brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
2685 }
2686
2687 brw_pop_insn_state(p);
2688 ex_desc = addr;
2689 }
2690
2691 send = next_insn(p, BRW_OPCODE_SENDS);
2692 brw_set_dest(p, send, dst);
2693 brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
2694 brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));
2695
2696 if (desc.file == BRW_IMMEDIATE_VALUE) {
2697 brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
2698 brw_inst_set_send_desc(devinfo, send, desc.ud);
2699 } else {
2700 assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
2701 assert(desc.nr == BRW_ARF_ADDRESS);
2702 assert(desc.subnr == 0);
2703 brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
2704 }
2705
2706 if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
2707 brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
2708 brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
2709 } else {
2710 assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
2711 assert(ex_desc.nr == BRW_ARF_ADDRESS);
2712 assert((ex_desc.subnr & 0x3) == 0);
2713 brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
2714 brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
2715 }
2716
2717 brw_inst_set_sfid(devinfo, send, sfid);
2718 brw_inst_set_eot(devinfo, send, eot);
2719 }
2720
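/* Wrapper around brw_send_indirect_message() that takes the binding
 * table index either from an immediate or from the first channel of a
 * surface register, masked to 8 bits to avoid hangs on out-of-bounds
 * indices.
 */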
2721 static void
2722 brw_send_indirect_surface_message(struct brw_codegen *p,
2723 unsigned sfid,
2724 struct brw_reg dst,
2725 struct brw_reg payload,
2726 struct brw_reg surface,
2727 unsigned desc_imm)
2728 {
2729 if (surface.file != BRW_IMMEDIATE_VALUE) {
2730 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2731
2732 brw_push_insn_state(p);
2733 brw_set_default_access_mode(p, BRW_ALIGN_1);
2734 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2735 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2736 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2737
2738       /* Mask out invalid bits from the surface index to avoid hangs, e.g. when
2739 * some surface array is accessed out of bounds.
2740 */
2741 brw_AND(p, addr,
2742 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2743 BRW_GET_SWZ(surface.swizzle, 0)),
2744 brw_imm_ud(0xff));
2745
2746 brw_pop_insn_state(p);
2747
2748 surface = addr;
2749 }
2750
2751 brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
2752 }
2753
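/* Check whether the given WHILE's backward jump lands at or before
 * start_offset, i.e. whether it closes a loop that contains that
 * offset.
 */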
2754 static bool
2755 while_jumps_before_offset(const struct gen_device_info *devinfo,
2756 brw_inst *insn, int while_offset, int start_offset)
2757 {
2758 int scale = 16 / brw_jump_scale(devinfo);
2759 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2760 : brw_inst_jip(devinfo, insn);
2761 assert(jip < 0);
2762 return while_offset + jip * scale <= start_offset;
2763 }
2764
2765
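/* Scan forward from start_offset for the instruction ending the current
 * block: an ENDIF, ELSE or HALT at the same nesting depth, or the WHILE
 * of the loop enclosing start_offset.  Returns 0 if no block end is
 * found before the end of the program.
 */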
2766 static int
2767 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2768 {
2769 int offset;
2770 void *store = p->store;
2771 const struct gen_device_info *devinfo = p->devinfo;
2772
2773 int depth = 0;
2774
2775 for (offset = next_offset(devinfo, store, start_offset);
2776 offset < p->next_insn_offset;
2777 offset = next_offset(devinfo, store, offset)) {
2778 brw_inst *insn = store + offset;
2779
2780 switch (brw_inst_opcode(devinfo, insn)) {
2781 case BRW_OPCODE_IF:
2782 depth++;
2783 break;
2784 case BRW_OPCODE_ENDIF:
2785 if (depth == 0)
2786 return offset;
2787 depth--;
2788 break;
2789 case BRW_OPCODE_WHILE:
2790 /* If the while doesn't jump before our instruction, it's the end
2791 * of a sibling do...while loop. Ignore it.
2792 */
2793 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2794 continue;
2795 /* fallthrough */
2796 case BRW_OPCODE_ELSE:
2797 case BRW_OPCODE_HALT:
2798 if (depth == 0)
2799 return offset;
2800 default:
2801 break;
2802 }
2803 }
2804
2805 return 0;
2806 }
2807
2808 /* There is no DO instruction on gen6 and later, so to find the end of
2809  * the loop we have to see if the WHILE is jumping back before our start
2810 * instruction.
2811 */
2812 static int
2813 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2814 {
2815 const struct gen_device_info *devinfo = p->devinfo;
2816 int offset;
2817 void *store = p->store;
2818
2819 assert(devinfo->gen >= 6);
2820
2821 /* Always start after the instruction (such as a WHILE) we're trying to fix
2822 * up.
2823 */
2824 for (offset = next_offset(devinfo, store, start_offset);
2825 offset < p->next_insn_offset;
2826 offset = next_offset(devinfo, store, offset)) {
2827 brw_inst *insn = store + offset;
2828
2829 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2830 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2831 return offset;
2832 }
2833 }
2834 assert(!"not reached");
2835 return start_offset;
2836 }
2837
2838 /* After program generation, go back and update the UIP and JIP of
2839 * BREAK, CONT, and HALT instructions to their correct locations.
2840 */
2841 void
2842 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2843 {
2844 const struct gen_device_info *devinfo = p->devinfo;
2845 int offset;
2846 int br = brw_jump_scale(devinfo);
2847 int scale = 16 / br;
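   /* Jump fields store branch distances in units of "scale" bytes: whole
    * bytes on gen8+ (scale == 1), 64-bit chunks earlier (scale == 8).
    */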
2848 void *store = p->store;
2849
2850 if (devinfo->gen < 6)
2851 return;
2852
2853 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2854 brw_inst *insn = store + offset;
2855 assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2856
2857 int block_end_offset = brw_find_next_block_end(p, offset);
2858 switch (brw_inst_opcode(devinfo, insn)) {
2859 case BRW_OPCODE_BREAK:
2860 assert(block_end_offset != 0);
2861 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2862 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2863 brw_inst_set_uip(devinfo, insn,
2864 (brw_find_loop_end(p, offset) - offset +
2865 (devinfo->gen == 6 ? 16 : 0)) / scale);
2866 break;
2867 case BRW_OPCODE_CONTINUE:
2868 assert(block_end_offset != 0);
2869 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2870 brw_inst_set_uip(devinfo, insn,
2871 (brw_find_loop_end(p, offset) - offset) / scale);
2872
2873 assert(brw_inst_uip(devinfo, insn) != 0);
2874 assert(brw_inst_jip(devinfo, insn) != 0);
2875 break;
2876
2877 case BRW_OPCODE_ENDIF: {
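         /* An ENDIF with no enclosing block end simply falls through:
          * jump one instruction ahead.
          */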
2878 int32_t jump = (block_end_offset == 0) ?
2879 1 * br : (block_end_offset - offset) / scale;
2880 if (devinfo->gen >= 7)
2881 brw_inst_set_jip(devinfo, insn, jump);
2882 else
2883 brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2884 break;
2885 }
2886
2887 case BRW_OPCODE_HALT:
2888 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2889 *
2890 * "In case of the halt instruction not inside any conditional
2891 * code block, the value of <JIP> and <UIP> should be the
2892 * same. In case of the halt instruction inside conditional code
2893 * block, the <UIP> should be the end of the program, and the
2894 * <JIP> should be end of the most inner conditional code block."
2895 *
2896 * The uip will have already been set by whoever set up the
2897 * instruction.
2898 */
2899 if (block_end_offset == 0) {
2900 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2901 } else {
2902 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2903 }
2904 assert(brw_inst_uip(devinfo, insn) != 0);
2905 assert(brw_inst_jip(devinfo, insn) != 0);
2906 break;
2907
2908 default:
2909 break;
2910 }
2911 }
2912 }
2913
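/* Emit an FF_SYNC URB message, used by fixed-function threads (such as
 * the gen6 GS) to serialize URB writes and optionally allocate a URB
 * handle.
 */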
2914 void brw_ff_sync(struct brw_codegen *p,
2915 struct brw_reg dest,
2916 unsigned msg_reg_nr,
2917 struct brw_reg src0,
2918 bool allocate,
2919 unsigned response_length,
2920 bool eot)
2921 {
2922 const struct gen_device_info *devinfo = p->devinfo;
2923 brw_inst *insn;
2924
2925 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2926
2927 insn = next_insn(p, BRW_OPCODE_SEND);
2928 brw_set_dest(p, insn, dest);
2929 brw_set_src0(p, insn, src0);
2930 brw_set_src1(p, insn, brw_imm_d(0));
2931
2932 if (devinfo->gen < 6)
2933 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2934
2935 brw_set_ff_sync_message(p,
2936 insn,
2937 allocate,
2938 response_length,
2939 eot);
2940 }
2941
2942 /**
2943 * Emit the SEND instruction necessary to generate stream output data on Gen6
2944 * (for transform feedback).
2945 *
2946 * If send_commit_msg is true, this is the last piece of stream output data
2947 * from this thread, so send the data as a committed write. According to the
2948 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2949 *
2950 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2951 * writes are complete by sending the final write as a committed write."
2952 */
2953 void
2954 brw_svb_write(struct brw_codegen *p,
2955 struct brw_reg dest,
2956 unsigned msg_reg_nr,
2957 struct brw_reg src0,
2958 unsigned binding_table_index,
2959 bool send_commit_msg)
2960 {
2961 const struct gen_device_info *devinfo = p->devinfo;
2962 const unsigned target_cache =
2963 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2964 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2965 BRW_SFID_DATAPORT_WRITE);
2966 brw_inst *insn;
2967
2968 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2969
2970 insn = next_insn(p, BRW_OPCODE_SEND);
2971 brw_inst_set_sfid(devinfo, insn, target_cache);
2972 brw_set_dest(p, insn, dest);
2973 brw_set_src0(p, insn, src0);
2974 brw_set_desc(p, insn,
2975 brw_message_desc(devinfo, 1, send_commit_msg, true) |
2976 brw_dp_write_desc(devinfo, binding_table_index,
2977 0, /* msg_control: ignored */
2978 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2979 0, /* last_render_target: ignored */
2980 send_commit_msg)); /* send_commit_msg */
2981 }
2982
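/* Number of registers needed for the payload or response of a surface
 * message: one per channel up to SIMD8, two per channel for SIMD16, and
 * a single register for SIMD4x2.
 */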
2983 static unsigned
2984 brw_surface_payload_size(struct brw_codegen *p,
2985 unsigned num_channels,
2986 unsigned exec_size /**< 0 for SIMD4x2 */)
2987 {
2988 if (exec_size == 0)
2989 return 1; /* SIMD4x2 */
2990 else if (exec_size <= 8)
2991 return num_channels;
2992 else
2993 return 2 * num_channels;
2994 }
2995
2996 void
2997 brw_untyped_atomic(struct brw_codegen *p,
2998 struct brw_reg dst,
2999 struct brw_reg payload,
3000 struct brw_reg surface,
3001 unsigned atomic_op,
3002 unsigned msg_length,
3003 bool response_expected,
3004 bool header_present)
3005 {
3006 const struct gen_device_info *devinfo = p->devinfo;
3007 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3008 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3009 GEN7_SFID_DATAPORT_DATA_CACHE);
3010 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3011 /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
3012 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3013 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3014 has_simd4x2 ? 0 : 8;
3015 const unsigned response_length =
3016 brw_surface_payload_size(p, response_expected, exec_size);
3017 const unsigned desc =
3018 brw_message_desc(devinfo, msg_length, response_length, header_present) |
3019 brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
3020 response_expected);
3021 /* Mask out unused components -- This is especially important in Align16
3022 * mode on generations that don't have native support for SIMD4x2 atomics,
3023 * because unused but enabled components will cause the dataport to perform
3024 * additional atomic operations on the addresses that happen to be in the
3025 * uninitialized Y, Z and W coordinates of the payload.
3026 */
3027 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3028
3029 brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
3030 payload, surface, desc);
3031 }
3032
3033 void
3034 brw_untyped_surface_read(struct brw_codegen *p,
3035 struct brw_reg dst,
3036 struct brw_reg payload,
3037 struct brw_reg surface,
3038 unsigned msg_length,
3039 unsigned num_channels)
3040 {
3041 const struct gen_device_info *devinfo = p->devinfo;
3042 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3043 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3044 GEN7_SFID_DATAPORT_DATA_CACHE);
3045 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3046 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
3047 const unsigned response_length =
3048 brw_surface_payload_size(p, num_channels, exec_size);
3049 const unsigned desc =
3050 brw_message_desc(devinfo, msg_length, response_length, false) |
3051 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3052
3053 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3054 }
3055
3056 void
3057 brw_untyped_surface_write(struct brw_codegen *p,
3058 struct brw_reg payload,
3059 struct brw_reg surface,
3060 unsigned msg_length,
3061 unsigned num_channels,
3062 bool header_present)
3063 {
3064 const struct gen_device_info *devinfo = p->devinfo;
3065 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3066 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3067 GEN7_SFID_DATAPORT_DATA_CACHE);
3068 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3069 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3070 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3071 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3072 has_simd4x2 ? 0 : 8;
3073 const unsigned desc =
3074 brw_message_desc(devinfo, msg_length, 0, header_present) |
3075 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
3076 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3077 const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
3078
3079 brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
3080 payload, surface, desc);
3081 }
3082
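/* Fill out the descriptor for a memory fence: a single header register
 * in, and one register back only when a commit write is requested.
 */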
3083 static void
3084 brw_set_memory_fence_message(struct brw_codegen *p,
3085 struct brw_inst *insn,
3086 enum brw_message_target sfid,
3087 bool commit_enable,
3088 unsigned bti)
3089 {
3090 const struct gen_device_info *devinfo = p->devinfo;
3091
3092 brw_set_desc(p, insn, brw_message_desc(
3093 devinfo, 1, (commit_enable ? 1 : 0), true));
3094
3095 brw_inst_set_sfid(devinfo, insn, sfid);
3096
3097 switch (sfid) {
3098 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3099 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3100 break;
3101 case GEN7_SFID_DATAPORT_DATA_CACHE:
3102 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3103 break;
3104 default:
3105 unreachable("Not reached");
3106 }
3107
3108 if (commit_enable)
3109 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3110
3111 assert(devinfo->gen >= 11 || bti == 0);
3112 brw_inst_set_binding_table_index(devinfo, insn, bti);
3113 }
3114
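/* Emit a memory fence through the data cache (and, on IVB, the render
 * cache as well), optionally followed by a stall until the fence's
 * commit write returns.
 */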
3115 void
3116 brw_memory_fence(struct brw_codegen *p,
3117 struct brw_reg dst,
3118 struct brw_reg src,
3119 enum opcode send_op,
3120 bool stall,
3121 unsigned bti)
3122 {
3123 const struct gen_device_info *devinfo = p->devinfo;
3124 const bool commit_enable = stall ||
3125 devinfo->gen >= 10 || /* HSD ES # 1404612949 */
3126 (devinfo->gen == 7 && !devinfo->is_haswell);
3127 struct brw_inst *insn;
3128
3129 brw_push_insn_state(p);
3130 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3131 brw_set_default_exec_size(p, BRW_EXECUTE_1);
3132 dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
3133 src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
3134
3135 /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3136 * message doesn't write anything back.
3137 */
3138 insn = next_insn(p, send_op);
3139 brw_set_dest(p, insn, dst);
3140 brw_set_src0(p, insn, src);
3141 brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
3142 commit_enable, bti);
3143
3144 if (devinfo->gen == 7 && !devinfo->is_haswell) {
3145 /* IVB does typed surface access through the render cache, so we need to
3146 * flush it too. Use a different register so both flushes can be
3147 * pipelined by the hardware.
3148 */
3149 insn = next_insn(p, send_op);
3150 brw_set_dest(p, insn, offset(dst, 1));
3151 brw_set_src0(p, insn, src);
3152 brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
3153 commit_enable, bti);
3154
3155 /* Now write the response of the second message into the response of the
3156 * first to trigger a pipeline stall -- This way future render and data
3157 * cache messages will be properly ordered with respect to past data and
3158 * render cache messages.
3159 */
3160 brw_MOV(p, dst, offset(dst, 1));
3161 }
3162
3163 if (stall)
3164 brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);
3165
3166 brw_pop_insn_state(p);
3167 }
3168
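/* Emit a pixel interpolator SEND, querying barycentric parameters at
 * the position selected by mode (and, for some modes, the offsets or
 * sample index in data).
 */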
3169 void
3170 brw_pixel_interpolator_query(struct brw_codegen *p,
3171 struct brw_reg dest,
3172 struct brw_reg mrf,
3173 bool noperspective,
3174 unsigned mode,
3175 struct brw_reg data,
3176 unsigned msg_length,
3177 unsigned response_length)
3178 {
3179 const struct gen_device_info *devinfo = p->devinfo;
3180 const uint16_t exec_size = brw_get_default_exec_size(p);
3181 const unsigned slot_group = brw_get_default_group(p) / 16;
3182 const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3183 const unsigned desc =
3184 brw_message_desc(devinfo, msg_length, response_length, false) |
3185 brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
3186 slot_group);
3187
3188 /* brw_send_indirect_message will automatically use a direct send message
3189 * if data is actually immediate.
3190 */
3191 brw_send_indirect_message(p,
3192 GEN7_SFID_PIXEL_INTERPOLATOR,
3193 dest,
3194 mrf,
3195 vec1(data),
3196 desc,
3197 false);
3198 }
3199
3200 void
3201 brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
3202 struct brw_reg mask)
3203 {
3204 const struct gen_device_info *devinfo = p->devinfo;
3205 const unsigned exec_size = 1 << brw_get_default_exec_size(p);
3206 const unsigned qtr_control = brw_get_default_group(p) / 8;
3207 brw_inst *inst;
3208
3209 assert(devinfo->gen >= 7);
3210 assert(mask.type == BRW_REGISTER_TYPE_UD);
3211
3212 brw_push_insn_state(p);
3213
3214 /* The flag register is only used on Gen7 in align1 mode, so avoid setting
3215     * unnecessary bits in the instruction words: get the information we need
3216     * and reset the default flag register.  This allows more instructions to be
3217 * compacted.
3218 */
3219 const unsigned flag_subreg = p->current->flag_subreg;
3220 brw_set_default_flag_reg(p, 0, 0);
3221
3222 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3223 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3224
3225 if (devinfo->gen >= 8) {
3226 /* Getting the first active channel index is easy on Gen8: Just find
3227 * the first bit set in the execution mask. The register exists on
3228 * HSW already but it reads back as all ones when the current
3229 * instruction has execution masking disabled, so it's kind of
3230 * useless.
3231 */
3232 struct brw_reg exec_mask =
3233 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
3234
3235 brw_set_default_exec_size(p, BRW_EXECUTE_1);
3236 if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
3237 /* Unfortunately, ce0 does not take into account the thread
3238 * dispatch mask, which may be a problem in cases where it's not
3239 * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3240 * some n). Combine ce0 with the given dispatch (or vector) mask
3241 * to mask off those channels which were never dispatched by the
3242 * hardware.
3243 */
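            /* Illustrative example (not in the original source): a tightly
             * packed dispatch mask for four channels is 0b1111 == 2^4 - 1.
             * A mask like 0b1011 is not tightly packed: channel 2 was never
             * dispatched, yet ce0 may still report it as enabled, which is
             * what the AND below corrects for.
             */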
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
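         /* Illustrative example (not in the original source): with
          * qtr_control == 1 the hardware presents ce0 relative to channel 8,
          * so an FBL result of 0 means that channel 8 of the thread is the
          * first live channel of that quarter.
          */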
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0.  We could use a single 32-wide move here if it
          * weren't for the hardware bug that causes channel enables to be
          * applied incorrectly to the second half of 32-wide instructions on
          * Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }
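
         /* Illustrative example (not in the original source): for
          * exec_size == 32 the loop above emits two 16-wide MOVs, one for
          * channels 0-15 and one for channels 16-31, which together deposit
          * the full 32-bit execution mask in the selected flag register.
          */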

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
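         /* Illustrative example (not in the original source): exec_size == 8
          * yields BRW_REGISTER_TYPE_UB and exec_size == 16 yields
          * BRW_REGISTER_TYPE_UW, so the FBL below reads back exactly the
          * flag bits written by the MOV sequence above.
          */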
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.  Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));
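
         /* Illustrative example (not in the original source): if the first
          * channel is live, bit 0 of the mask register is 1, so ~mask & 1 is
          * 0 and the first active channel index is 0; if it is dead, the AND
          * yields 1, selecting the second channel of the SIMD4x2 pair.
          */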

      } else {
         /* Overwrite the destination first without and then with execution
          * masking, in order to find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}

void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial: the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job,
       * but asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
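
      /* Illustrative example (not in the original source): in align1 mode
       * with idx an immediate 3, the region above becomes a <0;1,0> scalar
       * region at element offset 3, i.e. component 3 of src replicated
       * across every channel of the MOV.
       */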
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address. The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);
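
      /* Illustrative example of the restriction above (not in the original
       * source): an AddressImmediate of 8 added to an address register
       * holding 28 yields a sub-register offset of (28 + 8) & 31 == 4; the
       * carry out of the low 5 bits is dropped instead of bumping the
       * register number.
       */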

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));
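
         /* Illustrative example (not in the original source): for a packed
          * 32-bit component (type_sz == 4, hstride encoding 1) the shift
          * amount is log2(4) + 1 - 1 == 2, so the address register ends up
          * holding idx * 4 bytes.
          */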

         /* We can only address up to limit bytes using the indirect
          * addressing immediate; account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }
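
         /* Illustrative example (not in the original source): with
          * offset == 1300 and limit == 512, the ADD above folds
          * 1300 - (1300 % 512) == 1024 bytes into the address register,
          * leaving an in-range immediate offset of 276.
          */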

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around this restriction, we do two integer MOVs
             * instead of one 64-bit MOV.  Because no double value should
             * ever cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one; replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we just want to
 * write one u32.  So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
                          brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
                                                     false)));

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_binding_table_index(devinfo, send, surf_index);

   brw_pop_insn_state(p);
}


/**
 * Emit the SEND message for a barrier
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());
   brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
   brw_inst_set_gateway_notify(devinfo, inst, 1);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}


/**
 * Emit the wait instruction for a barrier
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

void
brw_float_controls_mode(struct brw_codegen *p,
                        unsigned mode, unsigned mask)
{
   brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
                            brw_imm_ud(~mask));
   brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);

   /* From the Skylake PRM, Volume 7, page 760:
    *    "Implementation Restriction on Register Access: When the control
    *     register is used as an explicit source and/or destination, hardware
    *     does not ensure execution pipeline coherency. Software must set the
    *     thread control field to ‘switch’ for an instruction that uses
    *     control register as an explicit operand."
    */
   brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);

   if (mode) {
      brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
                                 brw_imm_ud(mode));
      brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
      brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
   }
}
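
/* Usage sketch (illustrative, not part of the original file): assuming the
 * BRW_RND_MODE_* and BRW_CR0_RND_MODE_* defines from brw_eu_defines.h, a
 * caller could switch the EU to round-towards-zero with something like:
 *
 *    brw_float_controls_mode(p, BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT,
 *                            BRW_CR0_RND_MODE_MASK);
 */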