intel/eu/gen12: Codegen pathological SEND source and destination regions.
[mesa.git] src/intel/compiler/brw_eu_emit.c
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */


#include "brw_eu_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}

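/* Usage sketch (illustrative only, not part of the driver): a generator
 * whose message payload lives in an arbitrary GRF resolves the implied
 * move before building the SEND, e.g.:
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);   // hypothetical payload
 *    gen6_resolve_implied_move(p, &payload, 1);
 *    // payload now refers to m1, ready to be used as the SEND source.
 */
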
static void
gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}

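/* Worked example (assuming GEN7_MRF_HACK_START is 112, matching the
 * R112-R127 range quoted above): a register written as m3 by a Gen4-style
 * generator is rewritten to g115 on Gen7+, so the fake MRFs m0..m15 occupy
 * the top of the GRF space.
 */
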
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where if the destination is Byte,
    * the instruction needs to have a stride of 2 (except for packed byte
    * MOV).  This seems to be required even if the destination is the NULL
    * register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   gen7_convert_mrf_to_grf(p, &dest);

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
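      /* Note: in the encoded region fields, vstride == width + 1 selects a
       * vertical stride equal to the width in elements, so together with
       * hstride == 1 this accepts exactly the contiguous <N;N,1> regions;
       * anything else would be a pathological region for a SEND
       * destination.
       */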
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      assert(devinfo->gen < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             * Although Dst.HorzStride is a don't care for Align16, HW needs
             * this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * On platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16.  In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}

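/* For example, with p->automatic_exec_sizes enabled, emitting an instruction
 * whose destination region is a single scalar channel (width 1) narrows the
 * encoded exec_size down to 1 even if the generator's default was SIMD8 or
 * SIMD16.
 */
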
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             reg.vstride == reg.width + 1);
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         if (devinfo->gen < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}


void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->gen >= 12 &&
        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *     operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gen7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions. */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert(reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert(reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}

/**
 * Specify the descriptor and extended descriptor immediate for a SEND(C)
 * message instruction.
 */
void
brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
                unsigned desc, unsigned ex_desc)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
   brw_inst_set_src1_file_type(devinfo, inst,
                               BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
   brw_inst_set_send_desc(devinfo, inst, desc);
   if (devinfo->gen >= 9)
      brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
}

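/* Usage sketch (illustrative only): the message-specific helpers below all
 * funnel through brw_set_desc()/brw_set_desc_ex().  A hand-rolled message
 * could do the same directly, e.g.:
 *
 *    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
 *    brw_set_dest(p, send, dst);
 *    brw_set_src0(p, send, payload);
 *    brw_set_desc(p, send, brw_message_desc(devinfo, mlen, rlen, true));
 *
 * where mlen/rlen are the message and response lengths in registers and
 * dst/payload are hypothetical registers supplied by the caller.
 */
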
static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, msg_length, response_length, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}


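/* Worked example of the inference above: POW reads two operands, so its
 * message is two registers long (msg_length 2) and it writes one result
 * register (response_length 1); SINCOS reads one operand (msg_length 1) but
 * returns both sine and cosine, hence response_length 2.
 */
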
static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}

static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}

static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}

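/* Worked example of the block_size encoding above: for a 4-register block,
 * Gen8+ encodes log2(4) = 2, while Gen7 encodes num_regs - 1 = 3; the Gen8+
 * scheme is what makes the 8-register case (log2(8) = 3) expressible.
 */
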
static void
brw_inst_set_state(const struct gen_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}

#define next_insn brw_next_insn
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];

   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(devinfo, insn, opcode);

   /* Apply the default instruction state */
   brw_inst_set_state(devinfo, insn, p->current);

   return insn;
}

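/* Note that the doubling reralloc above may move the whole instruction
 * store, invalidating any brw_inst pointers previously returned.  That is
 * why the control-flow helpers below record instructions as indices into
 * p->store (e.g. `inst - p->store`) rather than as raw pointers, and why
 * brw_ENDIF calls next_insn() before dereferencing its saved indices.
 */
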
static brw_inst *
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static brw_inst *
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}

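/* For example, a source at byte offset 16 within its register (subnr 16)
 * becomes component 4 here, i.e. the fifth 32-bit channel of the register.
 */
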
static enum gen10_align1_3src_vertical_stride
to_3src_align1_vstride(const struct gen_device_info *devinfo,
                       enum brw_vertical_stride vstride)
{
   switch (vstride) {
   case BRW_VERTICAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
   case BRW_VERTICAL_STRIDE_1:
      assert(devinfo->gen >= 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
   case BRW_VERTICAL_STRIDE_2:
      assert(devinfo->gen < 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
   case BRW_VERTICAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
   case BRW_VERTICAL_STRIDE_8:
   case BRW_VERTICAL_STRIDE_16:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
   default:
      unreachable("invalid vstride");
   }
}


static enum gen10_align1_3src_src_horizontal_stride
to_3src_align1_hstride(enum brw_horizontal_stride hstride)
{
   switch (hstride) {
   case BRW_HORIZONTAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
   case BRW_HORIZONTAL_STRIDE_1:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
   case BRW_HORIZONTAL_STRIDE_2:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
   case BRW_HORIZONTAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
   default:
      unreachable("invalid hstride");
   }
}

static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < 128);
   assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
   assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
   assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      if (devinfo->gen >= 12) {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      } else {
         if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_ACCUMULATOR);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         }
      }
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      brw_inst_set_3src_a1_src0_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
      brw_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      /* no vstride on src2 */

      brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src0.hstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));
      brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src2.hstride));

      brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
      if (src0.type == BRW_REGISTER_TYPE_NF) {
         brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      }
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      if (devinfo->gen >= 12) {
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
      } else {
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      }

   } else {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D ||
             dest.type == BRW_REGISTER_TYPE_UD ||
             (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
      if (devinfo->gen == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->gen >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types.  The MAD and LRP emitters ensure
          * that all four types are float.  The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);

         /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
          *
          *    "Three source instructions can use operands with mixed-mode
          *     precision. When SrcType field is set to :f or :hf it defines
          *     precision for source 0 only, and fields Src1Type and Src2Type
          *     define precision for other source operands:
          *
          *     0b = :f. Single precision Float (32-bit).
          *     1b = :hf. Half precision Float (16-bit)."
          */
         if (src1.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);

         if (src2.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                          \
brw_inst *brw_##OP(struct brw_codegen *p,                 \
                   struct brw_reg dest,                   \
                   struct brw_reg src0)                   \
{                                                         \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);       \
}

#define ALU2(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1)                           \
{                                                                 \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);         \
}

#define ALU3(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   if (p->current->access_mode == BRW_ALIGN_16) {                 \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src0.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src1.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src2.swizzle = BRW_SWIZZLE_XXXX;                         \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

#define ALU3F(OP)                                                 \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                     \
          dest.type == BRW_REGISTER_TYPE_DF);                     \
   if (dest.type == BRW_REGISTER_TYPE_F) {                        \
      assert(src0.type == BRW_REGISTER_TYPE_F);                   \
      assert(src1.type == BRW_REGISTER_TYPE_F);                   \
      assert(src2.type == BRW_REGISTER_TYPE_F);                   \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                  \
   }                                                              \
                                                                  \
   if (p->current->access_mode == BRW_ALIGN_16) {                 \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src0.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src1.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src2.swizzle = BRW_SWIZZLE_XXXX;                         \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                    \
void brw_##OP(struct brw_codegen *p,                                 \
              struct brw_reg dest,                                   \
              struct brw_reg src)                                    \
{                                                                    \
   const struct gen_device_info *devinfo = p->devinfo;               \
   brw_inst *rnd, *add;                                              \
   rnd = next_insn(p, BRW_OPCODE_##OP);                              \
   brw_set_dest(p, rnd, dest);                                       \
   brw_set_src0(p, rnd, src);                                        \
                                                                     \
   if (devinfo->gen < 6) {                                           \
      /* turn on round-increments */                                 \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);   \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                 \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
   }                                                                 \
}

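/* Pre-Gen6 expansion of the ROUND macro above (illustrative sketch of the
 * emitted sequence):
 *
 *    rndz.r.f0.0 dst, src      // round, set per-channel increment bits
 *    (+f0.0) add dst, dst, 1.0 // predicated fixup where needed
 *
 * On Gen6+ only the first instruction is emitted.
 */
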
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)

brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use an <X,2,0> source region to
    * read each element twice.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell &&
       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      assert(src0.vstride == src0.width + src0.hstride);
      src0.vstride = src0.hstride;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}

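/* Worked example of the workaround above: a <8;8,1>:F source is rewritten
 * to <1;2,0>:F, i.e. rows of two elements with horizontal stride 0 and
 * vertical stride 1, so each channel is read twice and the odd channels
 * that IVB/BYT drop during F->DF conversion are all duplicates.
 */
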
brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

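/* Reminder on PLN semantics (per the PRMs; stated here only for context):
 * PLN evaluates a plane equation, roughly dst = src0.x * u + src0.y * v +
 * src0.w, where u is the register addressed by src1 and v is the register
 * immediately following it - which is why src0 is forced to a scalar region
 * and src1 to a full <8;8,1> region above.
 */
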
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}

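/* The zero-fill path above emits, in effect (illustrative assembly sketch):
 *
 *    mov dst<2>:hf  src:f   { NoDDClr }  // low 16 bits of each UD channel
 *    mov dst.1<2>:w 0:w     { NoDDChk }  // clear the high 16 bits
 *
 * with the NoDDClr/NoDDChk pair telling the hardware that the two
 * interleaved writes to the same destination register are intentional.
 */
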
brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *    Because this instruction does not have a 16-bit floating-point
       *    type, the source data type must be Word (W). The destination type
       *    must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}


void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}




/***********************************************************************
 * Comparisons, if/else/endif
 */

brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* The EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction: */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

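/* Usage sketch (illustrative only; real callers live in the code
 * generators): a predicated region is emitted by bracketing instructions
 * with brw_IF()/brw_ELSE()/brw_ENDIF(), relying on the default predicate
 * state having been set up first, e.g.:
 *
 *    brw_IF(p, BRW_EXECUTE_8);
 *    // ... emit "then" instructions ...
 *    brw_ELSE(p);
 *    // ... emit "else" instructions ...
 *    brw_ENDIF(p);   // patches the IF/ELSE jump targets
 */
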
/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}

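/* Worked example of the offsets above: each native instruction is 16 bytes,
 * so with an ELSE two instructions after the IF, the converted IF becomes a
 * predicate-inverted ADD of (2 + 1) * 16 = 48 bytes to IP, in effect
 * skipping the "then" block and landing on the first instruction of the
 * ELSE block.
 */
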
1462 /**
1463 * Patch IF and ELSE instructions with appropriate jump targets.
1464 */
1465 static void
1466 patch_IF_ELSE(struct brw_codegen *p,
1467 brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
1468 {
1469 const struct gen_device_info *devinfo = p->devinfo;
1470
1471 /* We shouldn't be patching IF and ELSE instructions in single program flow
1472 * mode when gen < 6, because in single program flow mode on those
1473 * platforms, we convert flow control instructions to conditional ADDs that
1474 * operate on IP (see brw_ENDIF).
1475 *
1476 * However, on Gen6, writing to IP doesn't work in single program flow mode
1477 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1478 * not be updated by non-flow control instructions."). And on later
1479 * platforms, there is no significant benefit to converting control flow
1480 * instructions to conditional ADDs. So we do patch IF and ELSE
1481 * instructions in single program flow mode on those platforms.
1482 */
1483 if (devinfo->gen < 6)
1484 assert(!p->single_program_flow);
1485
1486 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1487 assert(endif_inst != NULL);
1488 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1489
1490 unsigned br = brw_jump_scale(devinfo);
1491
1492 assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
1493 brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
1494
1495 if (else_inst == NULL) {
1496 /* Patch IF -> ENDIF */
1497 if (devinfo->gen < 6) {
1498 /* Turn it into an IFF, which means no mask stack operations for
1499 * all-false and jumping past the ENDIF.
1500 */
1501 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
1502 brw_inst_set_gen4_jump_count(devinfo, if_inst,
1503 br * (endif_inst - if_inst + 1));
1504 brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1505 } else if (devinfo->gen == 6) {
1506 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1507 brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1508 } else {
1509 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1510 brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1511 }
1512 } else {
1513 brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
1514
1515 /* Patch IF -> ELSE */
1516 if (devinfo->gen < 6) {
1517 brw_inst_set_gen4_jump_count(devinfo, if_inst,
1518 br * (else_inst - if_inst));
1519 brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1520 } else if (devinfo->gen == 6) {
1521 brw_inst_set_gen6_jump_count(devinfo, if_inst,
1522 br * (else_inst - if_inst + 1));
1523 }
1524
1525 /* Patch ELSE -> ENDIF */
1526 if (devinfo->gen < 6) {
1527 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1528 * matching ENDIF.
1529 */
1530 brw_inst_set_gen4_jump_count(devinfo, else_inst,
1531 br * (endif_inst - else_inst + 1));
1532 brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
1533 } else if (devinfo->gen == 6) {
1534 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1535 brw_inst_set_gen6_jump_count(devinfo, else_inst,
1536 br * (endif_inst - else_inst));
1537 } else {
1538 /* The IF instruction's JIP should point just past the ELSE */
1539 brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1540 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1541 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1542 brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1543 if (devinfo->gen >= 8) {
1544 /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1545 * should point to ENDIF.
1546 */
1547 brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1548 }
1549 }
1550 }
1551 }
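/* A worked example of the jump arithmetic above (illustrative only,
 * assuming brw_jump_scale() == 2 as on Gen5..Gen7): with the IF at
 * instruction index 10, the ELSE at 20 and the ENDIF at 30, the Gen7
 * path sets the IF's JIP to 2 * (20 - 10 + 1) = 22 (just past the
 * ELSE), the IF's UIP to 2 * (30 - 10) = 40, and the ELSE's JIP to
 * 2 * (30 - 20) = 20 -- distances are counted in 64-bit chunks, two
 * per uncompacted 128-bit instruction.
 */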
1552
1553 void
1554 brw_ELSE(struct brw_codegen *p)
1555 {
1556 const struct gen_device_info *devinfo = p->devinfo;
1557 brw_inst *insn;
1558
1559 insn = next_insn(p, BRW_OPCODE_ELSE);
1560
1561 if (devinfo->gen < 6) {
1562 brw_set_dest(p, insn, brw_ip_reg());
1563 brw_set_src0(p, insn, brw_ip_reg());
1564 brw_set_src1(p, insn, brw_imm_d(0x0));
1565 } else if (devinfo->gen == 6) {
1566 brw_set_dest(p, insn, brw_imm_w(0));
1567 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1568 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1569 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1570 } else if (devinfo->gen == 7) {
1571 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1572 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1573 brw_set_src1(p, insn, brw_imm_w(0));
1574 brw_inst_set_jip(devinfo, insn, 0);
1575 brw_inst_set_uip(devinfo, insn, 0);
1576 } else {
1577 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1578 if (devinfo->gen < 12)
1579 brw_set_src0(p, insn, brw_imm_d(0));
1580 brw_inst_set_jip(devinfo, insn, 0);
1581 brw_inst_set_uip(devinfo, insn, 0);
1582 }
1583
1584 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1585 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1586 if (!p->single_program_flow && devinfo->gen < 6)
1587 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1588
1589 push_if_stack(p, insn);
1590 }
1591
1592 void
1593 brw_ENDIF(struct brw_codegen *p)
1594 {
1595 const struct gen_device_info *devinfo = p->devinfo;
1596 brw_inst *insn = NULL;
1597 brw_inst *else_inst = NULL;
1598 brw_inst *if_inst = NULL;
1599 brw_inst *tmp;
1600 bool emit_endif = true;
1601
1602 /* In single program flow mode, we can express IF and ELSE instructions
1603 * equivalently as ADD instructions that operate on IP. On platforms prior
1604 * to Gen6, flow control instructions cause an implied thread switch, so
1605 * this is a significant savings.
1606 *
1607 * However, on Gen6, writing to IP doesn't work in single program flow mode
1608 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1609 * not be updated by non-flow control instructions."). And on later
1610 * platforms, there is no significant benefit to converting control flow
1611 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1612 * Gen5.
1613 */
1614 if (devinfo->gen < 6 && p->single_program_flow)
1615 emit_endif = false;
1616
1617 /*
1618 * A single next_insn() may change the base address of the instruction
1619 * store memory (p->store), so call it first, before deriving pointers
1620 * into the instruction store from an index.
1621 */
1622 if (emit_endif)
1623 insn = next_insn(p, BRW_OPCODE_ENDIF);
1624
1625 /* Pop the IF and (optional) ELSE instructions from the stack */
1626 p->if_depth_in_loop[p->loop_stack_depth]--;
1627 tmp = pop_if_stack(p);
1628 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1629 else_inst = tmp;
1630 tmp = pop_if_stack(p);
1631 }
1632 if_inst = tmp;
1633
1634 if (!emit_endif) {
1635 /* ENDIF is useless; don't bother emitting it. */
1636 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1637 return;
1638 }
1639
1640 if (devinfo->gen < 6) {
1641 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1642 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1643 brw_set_src1(p, insn, brw_imm_d(0x0));
1644 } else if (devinfo->gen == 6) {
1645 brw_set_dest(p, insn, brw_imm_w(0));
1646 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1647 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1648 } else if (devinfo->gen == 7) {
1649 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1650 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1651 brw_set_src1(p, insn, brw_imm_w(0));
1652 } else {
1653 brw_set_src0(p, insn, brw_imm_d(0));
1654 }
1655
1656 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1657 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1658 if (devinfo->gen < 6)
1659 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1660
1661 /* Also pop item off the stack in the endif instruction: */
1662 if (devinfo->gen < 6) {
1663 brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1664 brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1665 } else if (devinfo->gen == 6) {
1666 brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1667 } else {
1668 brw_inst_set_jip(devinfo, insn, 2);
1669 }
1670 patch_IF_ELSE(p, if_inst, else_inst, insn);
1671 }
1672
1673 brw_inst *
1674 brw_BREAK(struct brw_codegen *p)
1675 {
1676 const struct gen_device_info *devinfo = p->devinfo;
1677 brw_inst *insn;
1678
1679 insn = next_insn(p, BRW_OPCODE_BREAK);
1680 if (devinfo->gen >= 8) {
1681 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1682 brw_set_src0(p, insn, brw_imm_d(0x0));
1683 } else if (devinfo->gen >= 6) {
1684 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1685 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1686 brw_set_src1(p, insn, brw_imm_d(0x0));
1687 } else {
1688 brw_set_dest(p, insn, brw_ip_reg());
1689 brw_set_src0(p, insn, brw_ip_reg());
1690 brw_set_src1(p, insn, brw_imm_d(0x0));
1691 brw_inst_set_gen4_pop_count(devinfo, insn,
1692 p->if_depth_in_loop[p->loop_stack_depth]);
1693 }
1694 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1695 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1696
1697 return insn;
1698 }
1699
1700 brw_inst *
1701 brw_CONT(struct brw_codegen *p)
1702 {
1703 const struct gen_device_info *devinfo = p->devinfo;
1704 brw_inst *insn;
1705
1706 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1707 brw_set_dest(p, insn, brw_ip_reg());
1708 if (devinfo->gen >= 8) {
1709 brw_set_src0(p, insn, brw_imm_d(0x0));
1710 } else {
1711 brw_set_src0(p, insn, brw_ip_reg());
1712 brw_set_src1(p, insn, brw_imm_d(0x0));
1713 }
1714
1715 if (devinfo->gen < 6) {
1716 brw_inst_set_gen4_pop_count(devinfo, insn,
1717 p->if_depth_in_loop[p->loop_stack_depth]);
1718 }
1719 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1720 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1721 return insn;
1722 }
1723
1724 brw_inst *
1725 gen6_HALT(struct brw_codegen *p)
1726 {
1727 const struct gen_device_info *devinfo = p->devinfo;
1728 brw_inst *insn;
1729
1730 insn = next_insn(p, BRW_OPCODE_HALT);
1731 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1732 if (devinfo->gen < 8) {
1733 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1734 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1735 } else if (devinfo->gen < 12) {
1736 brw_set_src0(p, insn, brw_imm_d(0x0));
1737 }
1738
1739 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1740 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1741 return insn;
1742 }
1743
1744 /* DO/WHILE loop:
1745 *
1746 * The DO/WHILE is just an unterminated loop -- break or continue are
1747 * used for control within the loop. There are a few ways it can be
1748 * done.
1749 *
1750 * For uniform control flow, the WHILE is just a jump, so we emit an
1751 * ADD ip, ip, jip and no DO instruction.
1752 *
1753 * For non-uniform control flow pre-gen6, there's a DO instruction to
1754 * push the mask, and a WHILE to jump back, and BREAK to get out and
1755 * pop the mask.
1756 *
1757 * For gen6, there's no more mask stack, so no need for DO. WHILE
1758 * just points back to the first instruction of the loop.
1759 */
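/* A minimal emission sketch for the gen6+ case described above
 * (hypothetical caller, not part of this file):
 *
 *    brw_DO(p, BRW_EXECUTE_8);      // just records the loop start
 *    ...emit the loop body...
 *    brw_BREAK(p);                  // typically predicated; UIP/JIP patched later
 *    brw_WHILE(p);                  // jumps back to the loop start
 *    brw_set_uip_jip(p, 0);         // resolve BREAK/CONT targets
 */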
1760 brw_inst *
1761 brw_DO(struct brw_codegen *p, unsigned execute_size)
1762 {
1763 const struct gen_device_info *devinfo = p->devinfo;
1764
1765 if (devinfo->gen >= 6 || p->single_program_flow) {
1766 push_loop_stack(p, &p->store[p->nr_insn]);
1767 return &p->store[p->nr_insn];
1768 } else {
1769 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1770
1771 push_loop_stack(p, insn);
1772
1773 /* Override the defaults for this instruction:
1774 */
1775 brw_set_dest(p, insn, brw_null_reg());
1776 brw_set_src0(p, insn, brw_null_reg());
1777 brw_set_src1(p, insn, brw_null_reg());
1778
1779 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1780 brw_inst_set_exec_size(devinfo, insn, execute_size);
1781 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1782
1783 return insn;
1784 }
1785 }
1786
1787 /**
1788 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1789 * instruction here.
1790 *
1791 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1792 * nesting, since it can always just point to the end of the block/current loop.
1793 */
1794 static void
1795 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1796 {
1797 const struct gen_device_info *devinfo = p->devinfo;
1798 brw_inst *do_inst = get_inner_do_insn(p);
1799 brw_inst *inst;
1800 unsigned br = brw_jump_scale(devinfo);
1801
1802 assert(devinfo->gen < 6);
1803
1804 for (inst = while_inst - 1; inst != do_inst; inst--) {
1805 /* A jump count != 0 means that this instruction has already been
1806 * patched, because it is part of a loop nested inside the one we're
1807 * patching.
1808 */
1809 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1810 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1811 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1812 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1813 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1814 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1815 }
1816 }
1817 }
1818
1819 brw_inst *
1820 brw_WHILE(struct brw_codegen *p)
1821 {
1822 const struct gen_device_info *devinfo = p->devinfo;
1823 brw_inst *insn, *do_insn;
1824 unsigned br = brw_jump_scale(devinfo);
1825
1826 if (devinfo->gen >= 6) {
1827 insn = next_insn(p, BRW_OPCODE_WHILE);
1828 do_insn = get_inner_do_insn(p);
1829
1830 if (devinfo->gen >= 8) {
1831 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1832 if (devinfo->gen < 12)
1833 brw_set_src0(p, insn, brw_imm_d(0));
1834 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1835 } else if (devinfo->gen == 7) {
1836 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1837 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1838 brw_set_src1(p, insn, brw_imm_w(0));
1839 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1840 } else {
1841 brw_set_dest(p, insn, brw_imm_w(0));
1842 brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1843 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1844 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1845 }
1846
1847 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1848
1849 } else {
1850 if (p->single_program_flow) {
1851 insn = next_insn(p, BRW_OPCODE_ADD);
1852 do_insn = get_inner_do_insn(p);
1853
1854 brw_set_dest(p, insn, brw_ip_reg());
1855 brw_set_src0(p, insn, brw_ip_reg());
1856 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1857 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1858 } else {
1859 insn = next_insn(p, BRW_OPCODE_WHILE);
1860 do_insn = get_inner_do_insn(p);
1861
1862 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1863
1864 brw_set_dest(p, insn, brw_ip_reg());
1865 brw_set_src0(p, insn, brw_ip_reg());
1866 brw_set_src1(p, insn, brw_imm_d(0));
1867
1868 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1869 brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1870 brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1871
1872 brw_patch_break_cont(p, insn);
1873 }
1874 }
1875 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1876
1877 p->loop_stack_depth--;
1878
1879 return insn;
1880 }
1881
1882 /* FORWARD JUMPS:
1883 */
1884 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1885 {
1886 const struct gen_device_info *devinfo = p->devinfo;
1887 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1888 unsigned jmpi = 1;
1889
1890 if (devinfo->gen >= 5)
1891 jmpi = 2;
1892
1893 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1894 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1895
1896 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1897 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1898 }
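/* Typical use of the helper above (hypothetical sketch): remember the
 * index of an emitted JMPI, emit the instructions to be skipped, then
 * patch the jump to land just past them:
 *
 *    int jmp_idx = p->nr_insn;
 *    brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL);
 *    ...emit skipped instructions...
 *    brw_land_fwd_jump(p, jmp_idx);
 *
 * The count is scaled by 2 on Gen5+ because JMPI distances are counted
 * in 64-bit chunks.
 */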
1899
1900 /* To integrate with the above, it makes sense that the comparison
1901 * instruction should populate the flag register. It might be simpler
1902 * just to use the flag reg for most WM tasks?
1903 */
1904 void brw_CMP(struct brw_codegen *p,
1905 struct brw_reg dest,
1906 unsigned conditional,
1907 struct brw_reg src0,
1908 struct brw_reg src1)
1909 {
1910 const struct gen_device_info *devinfo = p->devinfo;
1911 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1912
1913 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1914 brw_set_dest(p, insn, dest);
1915 brw_set_src0(p, insn, src0);
1916 brw_set_src1(p, insn, src1);
1917
1918 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1919 * page says:
1920 * "Any CMP instruction with a null destination must use a {switch}."
1921 *
1922 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1923 * mentioned on their work-arounds pages.
1924 */
1925 if (devinfo->gen == 7) {
1926 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1927 dest.nr == BRW_ARF_NULL) {
1928 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1929 }
1930 }
1931 }
1932
1933 /***********************************************************************
1934 * Helpers for the various SEND message types:
1935 */
1936
1937 /** Extended math function, float[8].
1938 */
1939 void gen4_math(struct brw_codegen *p,
1940 struct brw_reg dest,
1941 unsigned function,
1942 unsigned msg_reg_nr,
1943 struct brw_reg src,
1944 unsigned precision )
1945 {
1946 const struct gen_device_info *devinfo = p->devinfo;
1947 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1948 unsigned data_type;
1949 if (has_scalar_region(src)) {
1950 data_type = BRW_MATH_DATA_SCALAR;
1951 } else {
1952 data_type = BRW_MATH_DATA_VECTOR;
1953 }
1954
1955 assert(devinfo->gen < 6);
1956
1957 /* Example code doesn't set predicate_control for send
1958 * instructions.
1959 */
1960 brw_inst_set_pred_control(devinfo, insn, 0);
1961 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1962
1963 brw_set_dest(p, insn, dest);
1964 brw_set_src0(p, insn, src);
1965 brw_set_math_message(p,
1966 insn,
1967 function,
1968 src.type == BRW_REGISTER_TYPE_D,
1969 precision,
1970 data_type);
1971 }
1972
1973 void gen6_math(struct brw_codegen *p,
1974 struct brw_reg dest,
1975 unsigned function,
1976 struct brw_reg src0,
1977 struct brw_reg src1)
1978 {
1979 const struct gen_device_info *devinfo = p->devinfo;
1980 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1981
1982 assert(devinfo->gen >= 6);
1983
1984 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1985 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1986
1987 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1988 if (devinfo->gen == 6) {
1989 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1990 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1991 }
1992
1993 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1994 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1995 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1996 assert(src0.type != BRW_REGISTER_TYPE_F);
1997 assert(src1.type != BRW_REGISTER_TYPE_F);
1998 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
1999 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2000 } else {
2001 assert(src0.type == BRW_REGISTER_TYPE_F ||
2002 (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
2003 assert(src1.type == BRW_REGISTER_TYPE_F ||
2004 (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
2005 }
2006
2007 /* Source modifiers are ignored for extended math instructions on Gen6. */
2008 if (devinfo->gen == 6) {
2009 assert(!src0.negate);
2010 assert(!src0.abs);
2011 assert(!src1.negate);
2012 assert(!src1.abs);
2013 }
2014
2015 brw_inst_set_math_function(devinfo, insn, function);
2016
2017 brw_set_dest(p, insn, dest);
2018 brw_set_src0(p, insn, src0);
2019 brw_set_src1(p, insn, src1);
2020 }
2021
2022 /**
2023 * Return the right surface index to access the thread scratch space using
2024 * stateless dataport messages.
2025 */
2026 unsigned
2027 brw_scratch_surface_idx(const struct brw_codegen *p)
2028 {
2029 /* The scratch space is thread-local so IA coherency is unnecessary. */
2030 if (p->devinfo->gen >= 8)
2031 return GEN8_BTI_STATELESS_NON_COHERENT;
2032 else
2033 return BRW_BTI_STATELESS;
2034 }
2035
2036 /**
2037 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
2038 * using a constant offset per channel.
2039 *
2040 * The offset must be aligned to oword size (16 bytes). Used for
2041 * register spilling.
2042 */
2043 void brw_oword_block_write_scratch(struct brw_codegen *p,
2044 struct brw_reg mrf,
2045 int num_regs,
2046 unsigned offset)
2047 {
2048 const struct gen_device_info *devinfo = p->devinfo;
2049 const unsigned target_cache =
2050 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2051 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2052 BRW_SFID_DATAPORT_WRITE);
2053 uint32_t msg_type;
2054
2055 if (devinfo->gen >= 6)
2056 offset /= 16;
2057
2058 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2059
2060 const unsigned mlen = 1 + num_regs;
2061
2062 /* Set up the message header. This is g0, with g0.2 filled with
2063 * the offset. We don't want to leave our offset around in g0 or
2064 * it'll screw up texture samples, so set it up inside the message
2065 * reg.
2066 */
2067 {
2068 brw_push_insn_state(p);
2069 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2070 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2071 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2072
2073 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2074
2075 /* set message header global offset field (reg 0, element 2) */
2076 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2077 brw_MOV(p,
2078 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2079 mrf.nr,
2080 2), BRW_REGISTER_TYPE_UD),
2081 brw_imm_ud(offset));
2082
2083 brw_pop_insn_state(p);
2084 }
2085
2086 {
2087 struct brw_reg dest;
2088 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2089 int send_commit_msg;
2090 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2091 BRW_REGISTER_TYPE_UW);
2092
2093 brw_inst_set_sfid(devinfo, insn, target_cache);
2094 brw_inst_set_compression(devinfo, insn, false);
2095
2096 if (brw_inst_exec_size(devinfo, insn) >= 16)
2097 src_header = vec16(src_header);
2098
2099 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2100 if (devinfo->gen < 6)
2101 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2102
2103 /* Until gen6, writes followed by reads from the same location
2104 * are not guaranteed to be ordered unless write_commit is set.
2105 * If set, then a no-op write is issued to the destination
2106 * register to set a dependency, and a read from the destination
2107 * can be used to ensure the ordering.
2108 *
2109 * For gen6, only writes between different threads need ordering
2110 * protection. Our use of DP writes is all about register
2111 * spilling within a thread.
2112 */
2113 if (devinfo->gen >= 6) {
2114 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2115 send_commit_msg = 0;
2116 } else {
2117 dest = src_header;
2118 send_commit_msg = 1;
2119 }
2120
2121 brw_set_dest(p, insn, dest);
2122 if (devinfo->gen >= 6) {
2123 brw_set_src0(p, insn, mrf);
2124 } else {
2125 brw_set_src0(p, insn, brw_null_reg());
2126 }
2127
2128 if (devinfo->gen >= 6)
2129 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2130 else
2131 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2132
2133 brw_set_desc(p, insn,
2134 brw_message_desc(devinfo, mlen, send_commit_msg, true) |
2135 brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
2136 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2137 msg_type, 0, /* not a render target */
2138 send_commit_msg));
2139 }
2140 }
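/* For example (hypothetical caller), spilling two GRFs whose contents
 * already sit in m2..m3 to scratch byte offset 64:
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, 64);
 *
 * m1 is overwritten with the message header, so mlen is 1 + 2 = 3, and
 * on Gen6+ the offset is converted to owords (64 / 16 = 4) before being
 * stored in element 2 of the header.
 */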
2141
2142
2143 /**
2144 * Read a block of owords (half a GRF each) from the scratch buffer
2145 * using a constant index per channel.
2146 *
2147 * Offset must be aligned to oword size (16 bytes). Used for register
2148 * spilling.
2149 */
2150 void
2151 brw_oword_block_read_scratch(struct brw_codegen *p,
2152 struct brw_reg dest,
2153 struct brw_reg mrf,
2154 int num_regs,
2155 unsigned offset)
2156 {
2157 const struct gen_device_info *devinfo = p->devinfo;
2158
2159 if (devinfo->gen >= 6)
2160 offset /= 16;
2161
2162 if (p->devinfo->gen >= 7) {
2163 /* On gen 7 and above, we no longer have message registers and we can
2164 * send from any register we want. By using the destination register
2165 * for the message, we guarantee that the implied message write won't
2166 * accidentally overwrite anything. This has been a problem because
2167 * the MRF registers and source for the final FB write are both fixed
2168 * and may overlap.
2169 */
2170 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2171 } else {
2172 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2173 }
2174 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2175
2176 const unsigned rlen = num_regs;
2177 const unsigned target_cache =
2178 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2179 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2180 BRW_SFID_DATAPORT_READ);
2181
2182 {
2183 brw_push_insn_state(p);
2184 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2185 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2186 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2187
2188 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2189
2190 /* set message header global offset field (reg 0, element 2) */
2191 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2192 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2193
2194 brw_pop_insn_state(p);
2195 }
2196
2197 {
2198 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2199
2200 brw_inst_set_sfid(devinfo, insn, target_cache);
2201 assert(brw_inst_pred_control(devinfo, insn) == 0);
2202 brw_inst_set_compression(devinfo, insn, false);
2203
2204 brw_set_dest(p, insn, dest); /* UW? */
2205 if (devinfo->gen >= 6) {
2206 brw_set_src0(p, insn, mrf);
2207 } else {
2208 brw_set_src0(p, insn, brw_null_reg());
2209 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2210 }
2211
2212 brw_set_desc(p, insn,
2213 brw_message_desc(devinfo, 1, rlen, true) |
2214 brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
2215 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2216 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2217 BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2218 }
2219 }
2220
2221 void
2222 gen7_block_read_scratch(struct brw_codegen *p,
2223 struct brw_reg dest,
2224 int num_regs,
2225 unsigned offset)
2226 {
2227 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2228 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2229
2230 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2231
2232 /* The HW requires that the header be present; it supplies the g0.5
2233 * scratch offset.
2234 */
2235 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2236
2237 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2238 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2239 * is 32 bytes, which happens to be the size of a register.
2240 */
2241 offset /= REG_SIZE;
2242 assert(offset < (1 << 12));
2243
2244 gen7_set_dp_scratch_message(p, insn,
2245 false, /* scratch read */
2246 false, /* OWords */
2247 false, /* invalidate after read */
2248 num_regs,
2249 offset,
2250 1, /* mlen: just g0 */
2251 num_regs, /* rlen */
2252 true); /* header present */
2253 }
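/* Offset encoding example for the message above (illustrative): a
 * spill slot at byte offset 128 becomes HWord offset 128 / REG_SIZE = 4;
 * the 12-bit field therefore limits scratch access to byte offsets below
 * 4096 * 32 = 128 KB.
 */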
2254
2255 /**
2256 * Read float[4] vectors from the data port constant cache.
2257 * Location (in buffer) should be a multiple of 16.
2258 * Used for fetching shader constants.
2259 */
2260 void brw_oword_block_read(struct brw_codegen *p,
2261 struct brw_reg dest,
2262 struct brw_reg mrf,
2263 uint32_t offset,
2264 uint32_t bind_table_index)
2265 {
2266 const struct gen_device_info *devinfo = p->devinfo;
2267 const unsigned target_cache =
2268 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2269 BRW_SFID_DATAPORT_READ);
2270 const unsigned exec_size = 1 << brw_get_default_exec_size(p);
2271
2272 /* On newer hardware, offset is in units of owords. */
2273 if (devinfo->gen >= 6)
2274 offset /= 16;
2275
2276 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2277
2278 brw_push_insn_state(p);
2279 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2280 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2281 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2282
2283 brw_push_insn_state(p);
2284 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2285 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2286
2287 /* set message header global offset field (reg 0, element 2) */
2288 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2289 brw_MOV(p,
2290 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2291 mrf.nr,
2292 2), BRW_REGISTER_TYPE_UD),
2293 brw_imm_ud(offset));
2294 brw_pop_insn_state(p);
2295
2296 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2297
2298 brw_inst_set_sfid(devinfo, insn, target_cache);
2299
2300 /* cast dest to a uword[8] vector */
2301 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2302
2303 brw_set_dest(p, insn, dest);
2304 if (devinfo->gen >= 6) {
2305 brw_set_src0(p, insn, mrf);
2306 } else {
2307 brw_set_src0(p, insn, brw_null_reg());
2308 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2309 }
2310
2311 brw_set_desc(p, insn,
2312 brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
2313 brw_dp_read_desc(devinfo, bind_table_index,
2314 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2315 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2316 BRW_DATAPORT_READ_TARGET_DATA_CACHE));
2317
2318 brw_pop_insn_state(p);
2319 }
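/* A (hypothetical) constant fetch from binding table entry 3 at buffer
 * offset 32 could look like:
 *
 *    brw_oword_block_read(p, dest, brw_message_reg(1), 32, 3);
 *
 * With the default SIMD8 execution size this reads two owords (one full
 * GRF) into dest, and on Gen6+ the offset lands in the header as
 * 32 / 16 = 2 owords.
 */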
2320
2321 brw_inst *
2322 brw_fb_WRITE(struct brw_codegen *p,
2323 struct brw_reg payload,
2324 struct brw_reg implied_header,
2325 unsigned msg_control,
2326 unsigned binding_table_index,
2327 unsigned msg_length,
2328 unsigned response_length,
2329 bool eot,
2330 bool last_render_target,
2331 bool header_present)
2332 {
2333 const struct gen_device_info *devinfo = p->devinfo;
2334 const unsigned target_cache =
2335 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2336 BRW_SFID_DATAPORT_WRITE);
2337 brw_inst *insn;
2338 unsigned msg_type;
2339 struct brw_reg dest, src0;
2340
2341 if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
2342 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2343 else
2344 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2345
2346 if (devinfo->gen >= 6) {
2347 insn = next_insn(p, BRW_OPCODE_SENDC);
2348 } else {
2349 insn = next_insn(p, BRW_OPCODE_SEND);
2350 }
2351 brw_inst_set_sfid(devinfo, insn, target_cache);
2352 brw_inst_set_compression(devinfo, insn, false);
2353
2354 if (devinfo->gen >= 6) {
2355 /* headerless version, just submit color payload */
2356 src0 = payload;
2357
2358 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2359 } else {
2360 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2361 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2362 src0 = implied_header;
2363
2364 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2365 }
2366
2367 brw_set_dest(p, insn, dest);
2368 brw_set_src0(p, insn, src0);
2369 brw_set_desc(p, insn,
2370 brw_message_desc(devinfo, msg_length, response_length,
2371 header_present) |
2372 brw_dp_write_desc(devinfo, binding_table_index, msg_control,
2373 msg_type, last_render_target,
2374 0 /* send_commit_msg */));
2375 brw_inst_set_eot(devinfo, insn, eot);
2376
2377 return insn;
2378 }
2379
2380 brw_inst *
2381 gen9_fb_READ(struct brw_codegen *p,
2382 struct brw_reg dst,
2383 struct brw_reg payload,
2384 unsigned binding_table_index,
2385 unsigned msg_length,
2386 unsigned response_length,
2387 bool per_sample)
2388 {
2389 const struct gen_device_info *devinfo = p->devinfo;
2390 assert(devinfo->gen >= 9);
2391 const unsigned msg_subtype =
2392 brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
2393 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2394
2395 brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
2396 brw_set_dest(p, insn, dst);
2397 brw_set_src0(p, insn, payload);
2398 brw_set_desc(
2399 p, insn,
2400 brw_message_desc(devinfo, msg_length, response_length, true) |
2401 brw_dp_read_desc(devinfo, binding_table_index,
2402 per_sample << 5 | msg_subtype,
2403 GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2404 BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2405 brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
2406
2407 return insn;
2408 }
2409
2410 /**
2411 * Texture sample instruction.
2412 * Note: the msg_type plus msg_length values determine exactly what kind
2413 * of sampling operation is performed. See volume 4, page 161 of docs.
2414 */
2415 void brw_SAMPLE(struct brw_codegen *p,
2416 struct brw_reg dest,
2417 unsigned msg_reg_nr,
2418 struct brw_reg src0,
2419 unsigned binding_table_index,
2420 unsigned sampler,
2421 unsigned msg_type,
2422 unsigned response_length,
2423 unsigned msg_length,
2424 unsigned header_present,
2425 unsigned simd_mode,
2426 unsigned return_format)
2427 {
2428 const struct gen_device_info *devinfo = p->devinfo;
2429 brw_inst *insn;
2430
2431 if (msg_reg_nr != -1)
2432 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2433
2434 insn = next_insn(p, BRW_OPCODE_SEND);
2435 brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
2436 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2437
2438 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2439 *
2440 * "Instruction compression is not allowed for this instruction (that
2441 * is, send). The hardware behavior is undefined if this instruction is
2442 * set as compressed. However, compress control can be set to "SecHalf"
2443 * to affect the EMask generation."
2444 *
2445 * No similar wording is found in later PRMs, but there are examples
2446 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2447 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2448 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2449 */
2450 brw_inst_set_compression(devinfo, insn, false);
2451
2452 if (devinfo->gen < 6)
2453 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2454
2455 brw_set_dest(p, insn, dest);
2456 brw_set_src0(p, insn, src0);
2457 brw_set_desc(p, insn,
2458 brw_message_desc(devinfo, msg_length, response_length,
2459 header_present) |
2460 brw_sampler_desc(devinfo, binding_table_index, sampler,
2461 msg_type, simd_mode, return_format));
2462 }
2463
2464 /* Adjust the message header's sampler state pointer to
2465 * select the correct group of 16 samplers.
2466 */
2467 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2468 struct brw_reg header,
2469 struct brw_reg sampler_index)
2470 {
2471 /* The "Sampler Index" field can only store values between 0 and 15.
2472 * However, we can add an offset to the "Sampler State Pointer"
2473 * field, effectively selecting a different set of 16 samplers.
2474 *
2475 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2476 * offset, and each sampler state is only 16 bytes, so we can't
2477 * exclusively use the offset - we have to use both.
2478 */
2479
2480 const struct gen_device_info *devinfo = p->devinfo;
2481
2482 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2483 const int sampler_state_size = 16; /* 16 bytes */
2484 uint32_t sampler = sampler_index.ud;
2485
2486 if (sampler >= 16) {
2487 assert(devinfo->is_haswell || devinfo->gen >= 8);
2488 brw_ADD(p,
2489 get_element_ud(header, 3),
2490 get_element_ud(brw_vec8_grf(0, 0), 3),
2491 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2492 }
2493 } else {
2494 /* Non-const sampler array indexing case */
2495 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2496 return;
2497 }
2498
2499 struct brw_reg temp = get_element_ud(header, 3);
2500
2501 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2502 brw_SHL(p, temp, temp, brw_imm_ud(4));
2503 brw_ADD(p,
2504 get_element_ud(header, 3),
2505 get_element_ud(brw_vec8_grf(0, 0), 3),
2506 temp);
2507 }
2508 }
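/* Worked example for the arithmetic above (illustrative): a sampler
 * index of 20 selects group 20 / 16 = 1, so the state pointer is
 * advanced by 16 * 1 * 16 = 256 bytes; the dynamic path computes the
 * same value as (20 & 0xf0) << 4 = 256, leaving 20 % 16 = 4 for the
 * "Sampler Index" field.
 */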
2509
2510 /* All these variables are pretty confusing - we might be better off
2511 * using bitmasks and macros for this, in the old style. Or perhaps
2512 * just having the caller instantiate the fields in dword3 itself.
2513 */
2514 void brw_urb_WRITE(struct brw_codegen *p,
2515 struct brw_reg dest,
2516 unsigned msg_reg_nr,
2517 struct brw_reg src0,
2518 enum brw_urb_write_flags flags,
2519 unsigned msg_length,
2520 unsigned response_length,
2521 unsigned offset,
2522 unsigned swizzle)
2523 {
2524 const struct gen_device_info *devinfo = p->devinfo;
2525 brw_inst *insn;
2526
2527 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2528
2529 if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2530 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2531 brw_push_insn_state(p);
2532 brw_set_default_access_mode(p, BRW_ALIGN_1);
2533 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2534 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2535 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2536 BRW_REGISTER_TYPE_UD),
2537 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2538 brw_imm_ud(0xff00));
2539 brw_pop_insn_state(p);
2540 }
2541
2542 insn = next_insn(p, BRW_OPCODE_SEND);
2543
2544 assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2545
2546 brw_set_dest(p, insn, dest);
2547 brw_set_src0(p, insn, src0);
2548 brw_set_src1(p, insn, brw_imm_d(0));
2549
2550 if (devinfo->gen < 6)
2551 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2552
2553 brw_set_urb_message(p,
2554 insn,
2555 flags,
2556 msg_length,
2557 response_length,
2558 offset,
2559 swizzle);
2560 }
2561
2562 void
2563 brw_send_indirect_message(struct brw_codegen *p,
2564 unsigned sfid,
2565 struct brw_reg dst,
2566 struct brw_reg payload,
2567 struct brw_reg desc,
2568 unsigned desc_imm,
2569 bool eot)
2570 {
2571 const struct gen_device_info *devinfo = p->devinfo;
2572 struct brw_inst *send;
2573
2574 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2575
2576 assert(desc.type == BRW_REGISTER_TYPE_UD);
2577
2578 if (desc.file == BRW_IMMEDIATE_VALUE) {
2579 send = next_insn(p, BRW_OPCODE_SEND);
2580 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2581 brw_set_desc(p, send, desc.ud | desc_imm);
2582 } else {
2583 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2584
2585 brw_push_insn_state(p);
2586 brw_set_default_access_mode(p, BRW_ALIGN_1);
2587 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2588 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2589 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2590
2591 /* Load the indirect descriptor to an address register using OR so the
2592 * caller can specify additional descriptor bits with the desc_imm
2593 * immediate.
2594 */
2595 brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2596
2597 brw_pop_insn_state(p);
2598
2599 send = next_insn(p, BRW_OPCODE_SEND);
2600 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2601 brw_set_src1(p, send, addr);
2602 }
2603
2604 brw_set_dest(p, send, dst);
2605 brw_inst_set_sfid(devinfo, send, sfid);
2606 brw_inst_set_eot(devinfo, send, eot);
2607 }
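/* As a (hypothetical) example, a caller with a dynamically computed
 * surface index in g3 can pass desc = retype(g3, UD) and put the static
 * message bits in desc_imm; the helper above then emits roughly:
 *
 *    or(1)   a0.0<1>:ud   g3.0<0,1,0>:ud   <desc_imm>
 *    send(8) dst          payload          a0.0
 *
 * combining the dynamic and static descriptor halves in the address
 * register.
 */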
2608
2609 void
2610 brw_send_indirect_split_message(struct brw_codegen *p,
2611 unsigned sfid,
2612 struct brw_reg dst,
2613 struct brw_reg payload0,
2614 struct brw_reg payload1,
2615 struct brw_reg desc,
2616 unsigned desc_imm,
2617 struct brw_reg ex_desc,
2618 unsigned ex_desc_imm,
2619 bool eot)
2620 {
2621 const struct gen_device_info *devinfo = p->devinfo;
2622 struct brw_inst *send;
2623
2624 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2625
2626 assert(desc.type == BRW_REGISTER_TYPE_UD);
2627
2628 if (desc.file == BRW_IMMEDIATE_VALUE) {
2629 desc.ud |= desc_imm;
2630 } else {
2631 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2632
2633 brw_push_insn_state(p);
2634 brw_set_default_access_mode(p, BRW_ALIGN_1);
2635 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2636 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2637 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2638
2639 /* Load the indirect descriptor to an address register using OR so the
2640 * caller can specify additional descriptor bits with the desc_imm
2641 * immediate.
2642 */
2643 brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2644
2645 brw_pop_insn_state(p);
2646 desc = addr;
2647 }
2648
2649 if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
2650 (ex_desc.ud & INTEL_MASK(15, 12)) == 0) {
2651 ex_desc.ud |= ex_desc_imm;
2652 } else {
2653 struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);
2654
2655 brw_push_insn_state(p);
2656 brw_set_default_access_mode(p, BRW_ALIGN_1);
2657 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2658 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2659 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2660
2661 /* Load the indirect extended descriptor to an address register using OR
2662 * so the caller can specify additional descriptor bits with the
2663 * desc_imm immediate.
2664 *
2665 * Even though the instruction dispatcher always pulls the SFID and EOT
2666 * fields from the instruction itself, the actual external unit that
2667 * processes the message gets the SFID and EOT from the extended
2668 * descriptor, which comes from the address register. If we don't OR
2669 * those two fields in, the external unit may get confused and hang.
2670 */
2671 unsigned imm_part = ex_desc_imm | sfid | eot << 5;
2672
2673 if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
2674 /* ex_desc bits 15:12 don't exist in the instruction encoding, so
2675 * we may have fallen back to an indirect extended descriptor.
2676 */
2677 brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
2678 } else {
2679 brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
2680 }
2681
2682 brw_pop_insn_state(p);
2683 ex_desc = addr;
2684 }
2685
2686 send = next_insn(p, BRW_OPCODE_SENDS);
2687 brw_set_dest(p, send, dst);
2688 brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
2689 brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));
2690
2691 if (desc.file == BRW_IMMEDIATE_VALUE) {
2692 brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
2693 brw_inst_set_send_desc(devinfo, send, desc.ud);
2694 } else {
2695 assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
2696 assert(desc.nr == BRW_ARF_ADDRESS);
2697 assert(desc.subnr == 0);
2698 brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
2699 }
2700
2701 if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
2702 brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
2703 brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
2704 } else {
2705 assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
2706 assert(ex_desc.nr == BRW_ARF_ADDRESS);
2707 assert((ex_desc.subnr & 0x3) == 0);
2708 brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
2709 brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
2710 }
2711
2712 brw_inst_set_sfid(devinfo, send, sfid);
2713 brw_inst_set_eot(devinfo, send, eot);
2714 }
2715
2716 static void
2717 brw_send_indirect_surface_message(struct brw_codegen *p,
2718 unsigned sfid,
2719 struct brw_reg dst,
2720 struct brw_reg payload,
2721 struct brw_reg surface,
2722 unsigned desc_imm)
2723 {
2724 if (surface.file != BRW_IMMEDIATE_VALUE) {
2725 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2726
2727 brw_push_insn_state(p);
2728 brw_set_default_access_mode(p, BRW_ALIGN_1);
2729 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2730 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2731 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2732
2733 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2734 * some surface array is accessed out of bounds.
2735 */
2736 brw_AND(p, addr,
2737 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2738 BRW_GET_SWZ(surface.swizzle, 0)),
2739 brw_imm_ud(0xff));
2740
2741 brw_pop_insn_state(p);
2742
2743 surface = addr;
2744 }
2745
2746 brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
2747 }
2748
2749 static bool
2750 while_jumps_before_offset(const struct gen_device_info *devinfo,
2751 brw_inst *insn, int while_offset, int start_offset)
2752 {
2753 int scale = 16 / brw_jump_scale(devinfo);
2754 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2755 : brw_inst_jip(devinfo, insn);
2756 assert(jip < 0);
2757 return while_offset + jip * scale <= start_offset;
2758 }
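/* Scale example (illustrative): on Gen7 brw_jump_scale() == 2, so each
 * JIP unit is 16 / 2 = 8 bytes; a WHILE with jip == -4 jumps back 32
 * bytes, i.e. two uncompacted instructions.
 */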
2759
2760
2761 static int
2762 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2763 {
2764 int offset;
2765 void *store = p->store;
2766 const struct gen_device_info *devinfo = p->devinfo;
2767
2768 int depth = 0;
2769
2770 for (offset = next_offset(devinfo, store, start_offset);
2771 offset < p->next_insn_offset;
2772 offset = next_offset(devinfo, store, offset)) {
2773 brw_inst *insn = store + offset;
2774
2775 switch (brw_inst_opcode(devinfo, insn)) {
2776 case BRW_OPCODE_IF:
2777 depth++;
2778 break;
2779 case BRW_OPCODE_ENDIF:
2780 if (depth == 0)
2781 return offset;
2782 depth--;
2783 break;
2784 case BRW_OPCODE_WHILE:
2785 /* If the while doesn't jump before our instruction, it's the end
2786 * of a sibling do...while loop. Ignore it.
2787 */
2788 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2789 continue;
2790 /* fallthrough */
2791 case BRW_OPCODE_ELSE:
2792 case BRW_OPCODE_HALT:
2793 if (depth == 0)
2794 return offset;
2795 default:
2796 break;
2797 }
2798 }
2799
2800 return 0;
2801 }
2802
2803 /* There is no DO instruction on gen6, so to find the end of the loop
2804 * we have to see if the loop is jumping back before our start
2805 * instruction.
2806 */
2807 static int
2808 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2809 {
2810 const struct gen_device_info *devinfo = p->devinfo;
2811 int offset;
2812 void *store = p->store;
2813
2814 assert(devinfo->gen >= 6);
2815
2816 /* Always start after the instruction (such as a WHILE) we're trying to fix
2817 * up.
2818 */
2819 for (offset = next_offset(devinfo, store, start_offset);
2820 offset < p->next_insn_offset;
2821 offset = next_offset(devinfo, store, offset)) {
2822 brw_inst *insn = store + offset;
2823
2824 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2825 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2826 return offset;
2827 }
2828 }
2829 assert(!"not reached");
2830 return start_offset;
2831 }
2832
2833 /* After program generation, go back and update the UIP and JIP of
2834 * BREAK, CONT, and HALT instructions to their correct locations.
2835 */
2836 void
2837 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2838 {
2839 const struct gen_device_info *devinfo = p->devinfo;
2840 int offset;
2841 int br = brw_jump_scale(devinfo);
2842 int scale = 16 / br;
2843 void *store = p->store;
2844
2845 if (devinfo->gen < 6)
2846 return;
2847
2848 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2849 brw_inst *insn = store + offset;
2850 assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2851
2852 int block_end_offset = brw_find_next_block_end(p, offset);
2853 switch (brw_inst_opcode(devinfo, insn)) {
2854 case BRW_OPCODE_BREAK:
2855 assert(block_end_offset != 0);
2856 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2857 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2858 brw_inst_set_uip(devinfo, insn,
2859 (brw_find_loop_end(p, offset) - offset +
2860 (devinfo->gen == 6 ? 16 : 0)) / scale);
2861 break;
2862 case BRW_OPCODE_CONTINUE:
2863 assert(block_end_offset != 0);
2864 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2865 brw_inst_set_uip(devinfo, insn,
2866 (brw_find_loop_end(p, offset) - offset) / scale);
2867
2868 assert(brw_inst_uip(devinfo, insn) != 0);
2869 assert(brw_inst_jip(devinfo, insn) != 0);
2870 break;
2871
2872 case BRW_OPCODE_ENDIF: {
2873 int32_t jump = (block_end_offset == 0) ?
2874 1 * br : (block_end_offset - offset) / scale;
2875 if (devinfo->gen >= 7)
2876 brw_inst_set_jip(devinfo, insn, jump);
2877 else
2878 brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2879 break;
2880 }
2881
2882 case BRW_OPCODE_HALT:
2883 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2884 *
2885 * "In case of the halt instruction not inside any conditional
2886 * code block, the value of <JIP> and <UIP> should be the
2887 * same. In case of the halt instruction inside conditional code
2888 * block, the <UIP> should be the end of the program, and the
2889 * <JIP> should be end of the most inner conditional code block."
2890 *
2891 * The uip will have already been set by whoever set up the
2892 * instruction.
2893 */
2894 if (block_end_offset == 0) {
2895 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2896 } else {
2897 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2898 }
2899 assert(brw_inst_uip(devinfo, insn) != 0);
2900 assert(brw_inst_jip(devinfo, insn) != 0);
2901 break;
2902
2903 default:
2904 break;
2905 }
2906 }
2907 }
2908
2909 void brw_ff_sync(struct brw_codegen *p,
2910 struct brw_reg dest,
2911 unsigned msg_reg_nr,
2912 struct brw_reg src0,
2913 bool allocate,
2914 unsigned response_length,
2915 bool eot)
2916 {
2917 const struct gen_device_info *devinfo = p->devinfo;
2918 brw_inst *insn;
2919
2920 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2921
2922 insn = next_insn(p, BRW_OPCODE_SEND);
2923 brw_set_dest(p, insn, dest);
2924 brw_set_src0(p, insn, src0);
2925 brw_set_src1(p, insn, brw_imm_d(0));
2926
2927 if (devinfo->gen < 6)
2928 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2929
2930 brw_set_ff_sync_message(p,
2931 insn,
2932 allocate,
2933 response_length,
2934 eot);
2935 }
2936
2937 /**
2938 * Emit the SEND instruction necessary to generate stream output data on Gen6
2939 * (for transform feedback).
2940 *
2941 * If send_commit_msg is true, this is the last piece of stream output data
2942 * from this thread, so send the data as a committed write. According to the
2943 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2944 *
2945 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2946 * writes are complete by sending the final write as a committed write."
2947 */
2948 void
2949 brw_svb_write(struct brw_codegen *p,
2950 struct brw_reg dest,
2951 unsigned msg_reg_nr,
2952 struct brw_reg src0,
2953 unsigned binding_table_index,
2954 bool send_commit_msg)
2955 {
2956 const struct gen_device_info *devinfo = p->devinfo;
2957 const unsigned target_cache =
2958 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2959 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2960 BRW_SFID_DATAPORT_WRITE);
2961 brw_inst *insn;
2962
2963 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2964
2965 insn = next_insn(p, BRW_OPCODE_SEND);
2966 brw_inst_set_sfid(devinfo, insn, target_cache);
2967 brw_set_dest(p, insn, dest);
2968 brw_set_src0(p, insn, src0);
2969 brw_set_desc(p, insn,
2970 brw_message_desc(devinfo, 1, send_commit_msg, true) |
2971 brw_dp_write_desc(devinfo, binding_table_index,
2972 0, /* msg_control: ignored */
2973 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2974 0, /* last_render_target: ignored */
2975 send_commit_msg)); /* send_commit_msg */
2976 }
2977
2978 static unsigned
2979 brw_surface_payload_size(struct brw_codegen *p,
2980 unsigned num_channels,
2981 unsigned exec_size /**< 0 for SIMD4x2 */)
2982 {
2983 if (exec_size == 0)
2984 return 1; /* SIMD4x2 */
2985 else if (exec_size <= 8)
2986 return num_channels;
2987 else
2988 return 2 * num_channels;
2989 }
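/* E.g. a SIMD16 message with four channels needs 2 * 4 = 8 payload
 * registers, a SIMD8 one needs 4, and the SIMD4x2 case (exec_size == 0)
 * always fits in a single register.
 */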
2990
2991 void
2992 brw_untyped_atomic(struct brw_codegen *p,
2993 struct brw_reg dst,
2994 struct brw_reg payload,
2995 struct brw_reg surface,
2996 unsigned atomic_op,
2997 unsigned msg_length,
2998 bool response_expected,
2999 bool header_present)
3000 {
3001 const struct gen_device_info *devinfo = p->devinfo;
3002 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3003 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3004 GEN7_SFID_DATAPORT_DATA_CACHE);
3005 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3006 /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
3007 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3008 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3009 has_simd4x2 ? 0 : 8;
3010 const unsigned response_length =
3011 brw_surface_payload_size(p, response_expected, exec_size);
3012 const unsigned desc =
3013 brw_message_desc(devinfo, msg_length, response_length, header_present) |
3014 brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
3015 response_expected);
3016 /* Mask out unused components -- This is especially important in Align16
3017 * mode on generations that don't have native support for SIMD4x2 atomics,
3018 * because unused but enabled components will cause the dataport to perform
3019 * additional atomic operations on the addresses that happen to be in the
3020 * uninitialized Y, Z and W coordinates of the payload.
3021 */
3022 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3023
3024 brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
3025 payload, surface, desc);
3026 }
3027
3028 void
3029 brw_untyped_surface_read(struct brw_codegen *p,
3030 struct brw_reg dst,
3031 struct brw_reg payload,
3032 struct brw_reg surface,
3033 unsigned msg_length,
3034 unsigned num_channels)
3035 {
3036 const struct gen_device_info *devinfo = p->devinfo;
3037 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3038 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3039 GEN7_SFID_DATAPORT_DATA_CACHE);
3040 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3041 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
3042 const unsigned response_length =
3043 brw_surface_payload_size(p, num_channels, exec_size);
3044 const unsigned desc =
3045 brw_message_desc(devinfo, msg_length, response_length, false) |
3046 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3047
3048 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3049 }
3050
3051 void
3052 brw_untyped_surface_write(struct brw_codegen *p,
3053 struct brw_reg payload,
3054 struct brw_reg surface,
3055 unsigned msg_length,
3056 unsigned num_channels,
3057 bool header_present)
3058 {
3059 const struct gen_device_info *devinfo = p->devinfo;
3060 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3061 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3062 GEN7_SFID_DATAPORT_DATA_CACHE);
3063 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3064 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3065 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3066 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3067 has_simd4x2 ? 0 : 8;
3068 const unsigned desc =
3069 brw_message_desc(devinfo, msg_length, 0, header_present) |
3070 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
3071 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3072 const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
3073
3074 brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
3075 payload, surface, desc);
3076 }
3077
3078 static void
3079 brw_set_memory_fence_message(struct brw_codegen *p,
3080 struct brw_inst *insn,
3081 enum brw_message_target sfid,
3082 bool commit_enable,
3083 unsigned bti)
3084 {
3085 const struct gen_device_info *devinfo = p->devinfo;
3086
3087 brw_set_desc(p, insn, brw_message_desc(
3088 devinfo, 1, (commit_enable ? 1 : 0), true));
3089
3090 brw_inst_set_sfid(devinfo, insn, sfid);
3091
3092 switch (sfid) {
3093 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3094 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3095 break;
3096 case GEN7_SFID_DATAPORT_DATA_CACHE:
3097 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3098 break;
3099 default:
3100 unreachable("Not reached");
3101 }
3102
3103 if (commit_enable)
3104 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3105
3106 assert(devinfo->gen >= 11 || bti == 0);
3107 brw_inst_set_binding_table_index(devinfo, insn, bti);
3108 }
3109
3110 void
3111 brw_memory_fence(struct brw_codegen *p,
3112 struct brw_reg dst,
3113 struct brw_reg src,
3114 enum opcode send_op,
3115 bool stall,
3116 unsigned bti)
3117 {
3118 const struct gen_device_info *devinfo = p->devinfo;
3119 const bool commit_enable = stall ||
3120 devinfo->gen >= 10 || /* HSD ES # 1404612949 */
3121 (devinfo->gen == 7 && !devinfo->is_haswell);
3122 struct brw_inst *insn;
3123
3124 brw_push_insn_state(p);
3125 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3126 brw_set_default_exec_size(p, BRW_EXECUTE_1);
3127 dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
3128 src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
3129
3130 /* Set dst as the destination for dependency tracking; the MEMORY_FENCE
3131 * message doesn't write anything back.
3132 */
3133 insn = next_insn(p, send_op);
3134 brw_set_dest(p, insn, dst);
3135 brw_set_src0(p, insn, src);
3136 brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
3137 commit_enable, bti);
3138
3139 if (devinfo->gen == 7 && !devinfo->is_haswell) {
3140 /* IVB does typed surface access through the render cache, so we need to
3141 * flush it too. Use a different register so both flushes can be
3142 * pipelined by the hardware.
3143 */
3144 insn = next_insn(p, send_op);
3145 brw_set_dest(p, insn, offset(dst, 1));
3146 brw_set_src0(p, insn, src);
3147 brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
3148 commit_enable, bti);
3149
3150 /* Now write the response of the second message into the response of the
3151 * first to trigger a pipeline stall -- this way future render and data
3152 * cache messages will be properly ordered with respect to past data and
3153 * render cache messages.
3154 */
3155 brw_MOV(p, dst, offset(dst, 1));
3156 }
3157
3158 if (stall)
3159 brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);
3160
3161 brw_pop_insn_state(p);
3162 }
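/* A (hypothetical) caller wanting a fence followed by a full stall
 * might use a scratch GRF for both the destination and the source:
 *
 *    brw_memory_fence(p, brw_vec8_grf(124, 0), brw_vec8_grf(124, 0),
 *                     BRW_OPCODE_SEND, true, 0);
 *
 * The destination is written only for dependency tracking; the stall
 * itself comes from the final MOV of dst into the null register.
 */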
3163
3164 void
3165 brw_pixel_interpolator_query(struct brw_codegen *p,
3166 struct brw_reg dest,
3167 struct brw_reg mrf,
3168 bool noperspective,
3169 unsigned mode,
3170 struct brw_reg data,
3171 unsigned msg_length,
3172 unsigned response_length)
3173 {
3174 const struct gen_device_info *devinfo = p->devinfo;
3175 const uint16_t exec_size = brw_get_default_exec_size(p);
3176 const unsigned slot_group = brw_get_default_group(p) / 16;
3177 const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3178 const unsigned desc =
3179 brw_message_desc(devinfo, msg_length, response_length, false) |
3180 brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
3181 slot_group);
3182
3183 /* brw_send_indirect_message will automatically use a direct send message
3184 * if data is actually immediate.
3185 */
3186 brw_send_indirect_message(p,
3187 GEN7_SFID_PIXEL_INTERPOLATOR,
3188 dest,
3189 mrf,
3190 vec1(data),
3191 desc,
3192 false);
3193 }
3194
3195 void
3196 brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
3197 struct brw_reg mask)
3198 {
3199 const struct gen_device_info *devinfo = p->devinfo;
3200 const unsigned exec_size = 1 << brw_get_default_exec_size(p);
3201 const unsigned qtr_control = brw_get_default_group(p) / 8;
3202 brw_inst *inst;
3203
3204 assert(devinfo->gen >= 7);
3205 assert(mask.type == BRW_REGISTER_TYPE_UD);
3206
3207 brw_push_insn_state(p);
3208
3209 /* The flag register is only used on Gen7 in align1 mode, so avoid setting
3210 * unnecessary bits in the instruction words, get the information we need
3211 * and reset the default flag register. This allows more instructions to be
3212 * compacted.
3213 */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just
          * find the first bit set in the execution mask.  The register
          * exists on HSW already but it reads back as all ones when the
          * current instruction has execution masking disabled, so it's
          * kind of useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n).  Combine ce0 with the given dispatch (or vector)
             * mask to mask off those channels which were never dispatched
             * by the hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value
          * of ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
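
         /* Worked example: with the second quarter selected
          * (qtr_control == 1) and a dispatch mask of 0xffff, the SHR above
          * yields 0xff, the AND drops ce0 bits for channels that were never
          * dispatched, and FBL returns the index of the lowest set bit --
          * e.g. an exec mask of 0b...10100 makes FBL write 2.
          */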
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0.  We could use a single 32-wide move here if it
          * weren't for the hardware bug that causes channel enables to be
          * applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }
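
         /* E.g. a 32-wide shader on Gen7 runs this loop twice: two 16-wide
          * predicate-generating MOVs covering channel groups 0-15 and
          * 16-31, each depositing its half of the execution mask into the
          * flag register.
          */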

         /* Find the first bit set in the exec_size-wide portion of the
          * flag register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
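         /* E.g. for a SIMD16 shader the flag is read back as a UW
          * (exec_size / 8 == 2 bytes), so flag bits 0b0000000000001100
          * make FBL write 2 -- the index of the first live channel.
          */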
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.  Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));
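
         /* I.e. if bit 0 of ce0 is set, channel 0 is live and
          * ~ce0 & 1 == 0; if it is clear, the result is 1 and the second
          * SIMD4x2 channel is the first live one.
          */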

      } else {
         /* Overwrite the destination first without and then with execution
          * masking, in order to find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}

void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job,
       * but asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
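
      /* E.g. in align1 with an immediate idx of 3, this emits a MOV from
       * the scalar region src.3<0;1,0>, replicating component 3 of src
       * into every channel of dst.
       */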
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset.  The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address.  Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));
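
         /* E.g. for a D-type source with hstride 1 the shift amount is
          * log2(4) + 1 - 1 == 2, so addr becomes idx * 4 -- the byte offset
          * of the selected dword component.
          */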

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, so account for the difference if the
          * source register is above this limit.
          */
         if (offset >= limit) {
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }
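
         /* E.g. for a source at r18 (offset == 576): the ADD folds 512
          * bytes into the address register and the remaining 64 bytes fit
          * in the indirect immediate.
          */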

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation
             *    is integer DWord multiply, indirect addressing must not
             *    be used."
             *
             * To work around this issue, we do two integer MOVs instead of
             * one 64-bit MOV.  Because no double value should ever cross a
             * register boundary, it's safe to use the immediate offset in
             * the indirect here to handle adding 4 bytes to the offset and
             * avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
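
            /* I.e. the low dword of the 64-bit value comes from
             * r[a0.0 + offset] and the high dword from
             * r[a0.0 + offset + 4], written to the two dword subscripts
             * of dst.
             */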
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one; replicate
          * it to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
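
         /* I.e. idx == 1 sets f1.0 to all ones through the NZ conditional
          * modifier, so the predicated SEL picks the second SIMD4x2
          * channel (suboffset(src, 4)); idx == 0 leaves the flag clear and
          * the first channel is selected.
          */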
      }
   }

   brw_pop_insn_state(p);
}

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword
 * for all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just
 * write one u32.  So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
                          brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
                                                     false)));

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_binding_table_index(devinfo, send, surf_index);

   brw_pop_insn_state(p);
}
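
/* Message layout sketch (as implied by the descriptor above, not spelled
 * out in this file): the two-GRF payload carries the buffer offset to
 * update followed by the u32 time delta to add, and no response is
 * requested (response length 0) since nothing reads the ATOMIC_ADD result.
 */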


/**
 * Emit the SEND message for a barrier
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());
   brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
   brw_inst_set_gateway_notify(devinfo, inst, 1);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}


/**
 * Emit the wait instruction for a barrier
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}
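
/* Pairing sketch (hypothetical call site; the payload register choice is an
 * assumption): a full workgroup barrier is the two helpers back to back --
 *
 *    brw_barrier(p, payload);   signal the gateway, which notifies n0
 *    brw_WAIT(p);               sleep until the notification arrives
 */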

void
brw_float_controls_mode(struct brw_codegen *p,
                        unsigned mode, unsigned mask)
{
   brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
                            brw_imm_ud(~mask));
   brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);

   /* From the Skylake PRM, Volume 7, page 760:
    *
    *    "Implementation Restriction on Register Access: When the control
    *    register is used as an explicit source and/or destination, hardware
    *    does not ensure execution pipeline coherency.  Software must set
    *    the thread control field to ‘switch’ for an instruction that uses
    *    control register as an explicit operand."
    */
   brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);

   if (mode) {
      brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
                                 brw_imm_ud(mode));
      brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
      brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
   }
}
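
/* Usage sketch (hypothetical; the BRW_CR0_* and BRW_RND_MODE_* names are
 * assumptions from elsewhere in the compiler, not defined in this file):
 * switching to round-toward-zero clears the rounding-mode field of cr0 and
 * then ORs in the RTZ encoding --
 *
 *    brw_float_controls_mode(p, BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT,
 *                            BRW_CR0_RND_MODE_MASK);
 */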