intel/compiler: Change src1 reg type to unsigned doubleword
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 struct brw_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct gen_device_info *devinfo = p->devinfo;
51 if (devinfo->gen < 6)
52 return;
53
54 if (src->file == BRW_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58 brw_push_insn_state(p);
59 brw_set_default_exec_size(p, BRW_EXECUTE_8);
60 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
61 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
62 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
63 retype(*src, BRW_REGISTER_TYPE_UD));
64 brw_pop_insn_state(p);
65 }
66 *src = brw_message_reg(msg_reg_nr);
67 }
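/* A minimal usage sketch (not from a real generator): resolve the
 * implied move for a hypothetical payload in g2 before emitting a SEND
 * that reads its message from m1. After the call, `payload` refers to
 * the message register regardless of where it started out.
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &payload, 1);
 *    // payload.file == BRW_MESSAGE_REGISTER_FILE, payload.nr == 1
 */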
68
69 static void
70 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
71 {
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
76 *
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
79 */
80 const struct gen_device_info *devinfo = p->devinfo;
81 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
82 reg->file = BRW_GENERAL_REGISTER_FILE;
83 reg->nr += GEN7_MRF_HACK_START;
84 }
85 }
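/* For illustration: on Gen7+ a message register is remapped into the
 * GRF space reserved by GEN7_MRF_HACK_START, so m2 becomes
 * g(GEN7_MRF_HACK_START + 2). A sketch:
 *
 *    struct brw_reg r = brw_message_reg(2);
 *    gen7_convert_mrf_to_grf(p, &r);
 *    // r.file == BRW_GENERAL_REGISTER_FILE
 *    // r.nr   == GEN7_MRF_HACK_START + 2
 */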
86
87 void
88 brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
89 {
90 const struct gen_device_info *devinfo = p->devinfo;
91
92 if (dest.file == BRW_MESSAGE_REGISTER_FILE)
93 assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
94 else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
95 assert(dest.nr < 128);
96
97 gen7_convert_mrf_to_grf(p, &dest);
98
99 brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
100 brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
101
102 if (dest.address_mode == BRW_ADDRESS_DIRECT) {
103 brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
104
105 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
106 brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
107 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
108 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
109 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
110 } else {
111 brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
112 brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
113 if (dest.file == BRW_GENERAL_REGISTER_FILE ||
114 dest.file == BRW_MESSAGE_REGISTER_FILE) {
115 assert(dest.writemask != 0);
116 }
117 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
118 * Although Dst.HorzStride is a don't care for Align16, HW needs
119 * this to be programmed as "01".
120 */
121 brw_inst_set_dst_hstride(devinfo, inst, 1);
122 }
123 } else {
124 brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);
125
126 /* These are different sizes in align1 vs align16:
127 */
128 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
129 brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
130 dest.indirect_offset);
131 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
132 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
133 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
134 } else {
135 brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
136 dest.indirect_offset);
137 /* Even though this field is ignored in DA16, it still needs to be set to '01'. */
138 brw_inst_set_dst_hstride(devinfo, inst, 1);
139 }
140 }
141
142 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
143 * or 16 (SIMD16), as that's normally correct. However, when dealing with
144 * small registers, it can be useful for us to automatically reduce it to
145 * match the register size.
146 */
147 if (p->automatic_exec_sizes) {
148 /*
149 * On platforms that support fp64 we can emit instructions with a width
150 * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
151 * these cases we need to make sure that these instructions have their
152 * exec sizes set properly when they are emitted and we can't rely on
153 * this code to fix it.
154 */
155 bool fix_exec_size;
156 if (devinfo->gen >= 6)
157 fix_exec_size = dest.width < BRW_EXECUTE_4;
158 else
159 fix_exec_size = dest.width < BRW_EXECUTE_8;
160
161 if (fix_exec_size)
162 brw_inst_set_exec_size(devinfo, inst, dest.width);
163 }
164 }
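/* Sketch of the automatic exec-size fixup above, assuming a codegen
 * context with p->automatic_exec_sizes enabled and a SIMD8 default:
 *
 *    brw_set_default_exec_size(p, BRW_EXECUTE_8);
 *    brw_MOV(p, brw_vec1_grf(4, 0), brw_imm_ud(0));
 *    // dest.width is BRW_WIDTH_1, so on Gen6+ the MOV is emitted with
 *    // exec_size reduced to BRW_EXECUTE_1 rather than the default 8.
 */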
165
166 void
167 brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
168 {
169 const struct gen_device_info *devinfo = p->devinfo;
170
171 if (reg.file == BRW_MESSAGE_REGISTER_FILE)
172 assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
173 else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
174 assert(reg.nr < 128);
175
176 gen7_convert_mrf_to_grf(p, &reg);
177
178 if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
179 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
180 /* Any source modifiers or regions will be ignored, since this just
181 * identifies the MRF/GRF to start reading the message contents from.
182 * Check for some likely failures.
183 */
184 assert(!reg.negate);
185 assert(!reg.abs);
186 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
187 }
188
189 brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
190 brw_inst_set_src0_abs(devinfo, inst, reg.abs);
191 brw_inst_set_src0_negate(devinfo, inst, reg.negate);
192 brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
193
194 if (reg.file == BRW_IMMEDIATE_VALUE) {
195 if (reg.type == BRW_REGISTER_TYPE_DF ||
196 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
197 brw_inst_set_imm_df(devinfo, inst, reg.df);
198 else if (reg.type == BRW_REGISTER_TYPE_UQ ||
199 reg.type == BRW_REGISTER_TYPE_Q)
200 brw_inst_set_imm_uq(devinfo, inst, reg.u64);
201 else
202 brw_inst_set_imm_ud(devinfo, inst, reg.ud);
203
204 if (type_sz(reg.type) < 8) {
205 brw_inst_set_src1_reg_file(devinfo, inst,
206 BRW_ARCHITECTURE_REGISTER_FILE);
207 brw_inst_set_src1_reg_hw_type(devinfo, inst,
208 brw_inst_src0_reg_hw_type(devinfo, inst));
209 }
210 } else {
211 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
212 brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
213 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
214 brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
215 } else {
216 brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
217 }
218 } else {
219 brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
220
221 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
222 brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
223 } else {
224 brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
225 }
226 }
227
228 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
229 if (reg.width == BRW_WIDTH_1 &&
230 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
231 brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
232 brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
233 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
234 } else {
235 brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
236 brw_inst_set_src0_width(devinfo, inst, reg.width);
237 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
238 }
239 } else {
240 brw_inst_set_src0_da16_swiz_x(devinfo, inst,
241 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
242 brw_inst_set_src0_da16_swiz_y(devinfo, inst,
243 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
244 brw_inst_set_src0_da16_swiz_z(devinfo, inst,
245 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
246 brw_inst_set_src0_da16_swiz_w(devinfo, inst,
247 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
248
249 if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
250 /* This is an oddity arising from our use of the same register
251 * descriptions for align_16 as for align_1:
252 */
253 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
254 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
255 reg.type == BRW_REGISTER_TYPE_DF &&
256 reg.vstride == BRW_VERTICAL_STRIDE_2) {
257 /* From SNB PRM:
258 *
259 * "For Align16 access mode, only encodings of 0000 and 0011
260 * are allowed. Other codes are reserved."
261 *
262 * Presumably the DevSNB behavior applies to IVB as well.
263 */
264 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
265 } else {
266 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
267 }
268 }
269 }
270 }
271
272
273 void
274 brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
275 {
276 const struct gen_device_info *devinfo = p->devinfo;
277
278 if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
279 assert(reg.nr < 128);
280
281 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
282 *
283 * "Accumulator registers may be accessed explicitly as src0
284 * operands only."
285 */
286 assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
287 reg.nr != BRW_ARF_ACCUMULATOR);
288
289 gen7_convert_mrf_to_grf(p, &reg);
290 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
291
292 brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
293 brw_inst_set_src1_abs(devinfo, inst, reg.abs);
294 brw_inst_set_src1_negate(devinfo, inst, reg.negate);
295
296 /* Only src1 can be immediate in two-argument instructions.
297 */
298 assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
299
300 if (reg.file == BRW_IMMEDIATE_VALUE) {
301 /* two-argument instructions can only use 32-bit immediates */
302 assert(type_sz(reg.type) < 8);
303 brw_inst_set_imm_ud(devinfo, inst, reg.ud);
304 } else {
305 /* This is a hardware restriction, which may or may not be lifted
306 * in the future:
307 */
308 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
309 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
310
311 brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
312 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
313 brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
314 } else {
315 brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
316 }
317
318 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
319 if (reg.width == BRW_WIDTH_1 &&
320 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
321 brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
322 brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
323 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
324 } else {
325 brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
326 brw_inst_set_src1_width(devinfo, inst, reg.width);
327 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
328 }
329 } else {
330 brw_inst_set_src1_da16_swiz_x(devinfo, inst,
331 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
332 brw_inst_set_src1_da16_swiz_y(devinfo, inst,
333 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
334 brw_inst_set_src1_da16_swiz_z(devinfo, inst,
335 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
336 brw_inst_set_src1_da16_swiz_w(devinfo, inst,
337 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
338
339 if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
340 /* This is an oddity arising from our use of the same register
341 * descriptions for align_16 as for align_1:
342 */
343 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
344 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
345 reg.type == BRW_REGISTER_TYPE_DF &&
346 reg.vstride == BRW_VERTICAL_STRIDE_2) {
347 /* From SNB PRM:
348 *
349 * "For Align16 access mode, only encodings of 0000 and 0011
350 * are allowed. Other codes are reserved."
351 *
352 * Presumably the DevSNB behavior applies to IVB as well.
353 */
354 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
355 } else {
356 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
357 }
358 }
359 }
360 }
361
362 /**
363 * Specify the descriptor and extended descriptor immediate for a SEND(C)
364 * message instruction.
365 */
366 void
367 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
368 unsigned desc, unsigned ex_desc)
369 {
370 const struct gen_device_info *devinfo = p->devinfo;
371 assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
372 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
373 brw_inst_set_src1_file_type(devinfo, inst,
374 BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
375 brw_inst_set_send_desc(devinfo, inst, desc);
376 if (devinfo->gen >= 9)
377 brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
378 }
379
380 static void brw_set_math_message( struct brw_codegen *p,
381 brw_inst *inst,
382 unsigned function,
383 unsigned integer_type,
384 bool low_precision,
385 unsigned dataType )
386 {
387 const struct gen_device_info *devinfo = p->devinfo;
388 unsigned msg_length;
389 unsigned response_length;
390
391 /* Infer message length from the function */
392 switch (function) {
393 case BRW_MATH_FUNCTION_POW:
394 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
395 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
396 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
397 msg_length = 2;
398 break;
399 default:
400 msg_length = 1;
401 break;
402 }
403
404 /* Infer response length from the function */
405 switch (function) {
406 case BRW_MATH_FUNCTION_SINCOS:
407 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
408 response_length = 2;
409 break;
410 default:
411 response_length = 1;
412 break;
413 }
414
415 brw_set_desc(p, inst, brw_message_desc(
416 devinfo, msg_length, response_length, false));
417
418 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
419 brw_inst_set_math_msg_function(devinfo, inst, function);
420 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
421 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
422 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
423 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
424 brw_inst_set_saturate(devinfo, inst, 0);
425 }
426
427
428 static void brw_set_ff_sync_message(struct brw_codegen *p,
429 brw_inst *insn,
430 bool allocate,
431 unsigned response_length,
432 bool end_of_thread)
433 {
434 const struct gen_device_info *devinfo = p->devinfo;
435
436 brw_set_desc(p, insn, brw_message_desc(
437 devinfo, 1, response_length, true));
438
439 brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
440 brw_inst_set_eot(devinfo, insn, end_of_thread);
441 brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
442 brw_inst_set_urb_allocate(devinfo, insn, allocate);
443 /* The following fields are not used by FF_SYNC: */
444 brw_inst_set_urb_global_offset(devinfo, insn, 0);
445 brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
446 brw_inst_set_urb_used(devinfo, insn, 0);
447 brw_inst_set_urb_complete(devinfo, insn, 0);
448 }
449
450 static void brw_set_urb_message( struct brw_codegen *p,
451 brw_inst *insn,
452 enum brw_urb_write_flags flags,
453 unsigned msg_length,
454 unsigned response_length,
455 unsigned offset,
456 unsigned swizzle_control )
457 {
458 const struct gen_device_info *devinfo = p->devinfo;
459
460 assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
461 assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
462 assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
463
464 brw_set_desc(p, insn, brw_message_desc(
465 devinfo, msg_length, response_length, true));
466
467 brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
468 brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));
469
470 if (flags & BRW_URB_WRITE_OWORD) {
471 assert(msg_length == 2); /* header + one OWORD of data */
472 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
473 } else {
474 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
475 }
476
477 brw_inst_set_urb_global_offset(devinfo, insn, offset);
478 brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
479
480 if (devinfo->gen < 8) {
481 brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
482 }
483
484 if (devinfo->gen < 7) {
485 brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
486 brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
487 } else {
488 brw_inst_set_urb_per_slot_offset(devinfo, insn,
489 !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
490 }
491 }
492
493 static void
494 gen7_set_dp_scratch_message(struct brw_codegen *p,
495 brw_inst *inst,
496 bool write,
497 bool dword,
498 bool invalidate_after_read,
499 unsigned num_regs,
500 unsigned addr_offset,
501 unsigned mlen,
502 unsigned rlen,
503 bool header_present)
504 {
505 const struct gen_device_info *devinfo = p->devinfo;
506 assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
507 (devinfo->gen >= 8 && num_regs == 8));
508 const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
509 num_regs - 1);
510
511 brw_set_desc(p, inst, brw_message_desc(
512 devinfo, mlen, rlen, header_present));
513
514 brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
515 brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
516 brw_inst_set_scratch_read_write(devinfo, inst, write);
517 brw_inst_set_scratch_type(devinfo, inst, dword);
518 brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
519 brw_inst_set_scratch_block_size(devinfo, inst, block_size);
520 brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
521 }
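/* Example of the block-size encoding above: for num_regs == 4 the field
 * is log2(4) == 2 on Gen8+, but 4 - 1 == 3 on Gen7/7.5, which only
 * allows 1, 2 or 4 registers per scratch message.
 */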
522
523 static void
524 brw_inst_set_state(const struct gen_device_info *devinfo,
525 brw_inst *insn,
526 const struct brw_insn_state *state)
527 {
528 brw_inst_set_exec_size(devinfo, insn, state->exec_size);
529 brw_inst_set_group(devinfo, insn, state->group);
530 brw_inst_set_compression(devinfo, insn, state->compressed);
531 brw_inst_set_access_mode(devinfo, insn, state->access_mode);
532 brw_inst_set_mask_control(devinfo, insn, state->mask_control);
533 brw_inst_set_saturate(devinfo, insn, state->saturate);
534 brw_inst_set_pred_control(devinfo, insn, state->predicate);
535 brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);
536
537 if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
538 state->access_mode == BRW_ALIGN_16) {
539 brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
540 if (devinfo->gen >= 7)
541 brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
542 } else {
543 brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
544 if (devinfo->gen >= 7)
545 brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
546 }
547
548 if (devinfo->gen >= 6)
549 brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
550 }
551
552 #define next_insn brw_next_insn
553 brw_inst *
554 brw_next_insn(struct brw_codegen *p, unsigned opcode)
555 {
556 const struct gen_device_info *devinfo = p->devinfo;
557 brw_inst *insn;
558
559 if (p->nr_insn + 1 > p->store_size) {
560 p->store_size <<= 1;
561 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
562 }
563
564 p->next_insn_offset += 16;
565 insn = &p->store[p->nr_insn++];
566
567 memset(insn, 0, sizeof(*insn));
568 brw_inst_set_opcode(devinfo, insn, opcode);
569
570 /* Apply the default instruction state */
571 brw_inst_set_state(devinfo, insn, p->current);
572
573 return insn;
574 }
575
576 static brw_inst *
577 brw_alu1(struct brw_codegen *p, unsigned opcode,
578 struct brw_reg dest, struct brw_reg src)
579 {
580 brw_inst *insn = next_insn(p, opcode);
581 brw_set_dest(p, insn, dest);
582 brw_set_src0(p, insn, src);
583 return insn;
584 }
585
586 static brw_inst *
587 brw_alu2(struct brw_codegen *p, unsigned opcode,
588 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
589 {
590 /* 64-bit immediates are only supported on 1-src instructions */
591 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
592 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
593
594 brw_inst *insn = next_insn(p, opcode);
595 brw_set_dest(p, insn, dest);
596 brw_set_src0(p, insn, src0);
597 brw_set_src1(p, insn, src1);
598 return insn;
599 }
600
601 static int
602 get_3src_subreg_nr(struct brw_reg reg)
603 {
604 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
605 * use 32-bit units (components 0..7). Since they only support F/D/UD
606 * types, this doesn't lose any flexibility, but uses fewer bits.
607 */
608 return reg.subnr / 4;
609 }
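/* E.g. a 32-bit channel at byte offset 12 is component 12 / 4 == 3. */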
610
611 static enum gen10_align1_3src_vertical_stride
612 to_3src_align1_vstride(enum brw_vertical_stride vstride)
613 {
614 switch (vstride) {
615 case BRW_VERTICAL_STRIDE_0:
616 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
617 case BRW_VERTICAL_STRIDE_2:
618 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
619 case BRW_VERTICAL_STRIDE_4:
620 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
621 case BRW_VERTICAL_STRIDE_8:
622 case BRW_VERTICAL_STRIDE_16:
623 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
624 default:
625 unreachable("invalid vstride");
626 }
627 }
628
629
630 static enum gen10_align1_3src_src_horizontal_stride
631 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
632 {
633 switch (hstride) {
634 case BRW_HORIZONTAL_STRIDE_0:
635 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
636 case BRW_HORIZONTAL_STRIDE_1:
637 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
638 case BRW_HORIZONTAL_STRIDE_2:
639 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
640 case BRW_HORIZONTAL_STRIDE_4:
641 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
642 default:
643 unreachable("invalid hstride");
644 }
645 }
646
647 static brw_inst *
648 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
649 struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
650 {
651 const struct gen_device_info *devinfo = p->devinfo;
652 brw_inst *inst = next_insn(p, opcode);
653
654 gen7_convert_mrf_to_grf(p, &dest);
655
656 assert(dest.nr < 128);
657 assert(src0.nr < 128);
658 assert(src1.nr < 128);
659 assert(src2.nr < 128);
660 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
661 assert(src0.address_mode == BRW_ADDRESS_DIRECT);
662 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
663 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
664
665 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
666 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
667 dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
668
669 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
670 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
671 BRW_ALIGN1_3SRC_ACCUMULATOR);
672 brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
673 } else {
674 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
675 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
676 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
677 }
678 brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
679
680 brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
681
682 if (brw_reg_type_is_floating_point(dest.type)) {
683 brw_inst_set_3src_a1_exec_type(devinfo, inst,
684 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
685 } else {
686 brw_inst_set_3src_a1_exec_type(devinfo, inst,
687 BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
688 }
689
690 brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
691 brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
692 brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
693 brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
694
695 brw_inst_set_3src_a1_src0_vstride(devinfo, inst,
696 to_3src_align1_vstride(src0.vstride));
697 brw_inst_set_3src_a1_src1_vstride(devinfo, inst,
698 to_3src_align1_vstride(src1.vstride));
699 /* no vstride on src2 */
700
701 brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
702 to_3src_align1_hstride(src0.hstride));
703 brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
704 to_3src_align1_hstride(src1.hstride));
705 brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
706 to_3src_align1_hstride(src2.hstride));
707
708 brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
709 if (src0.type == BRW_REGISTER_TYPE_NF) {
710 brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
711 } else {
712 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
713 }
714 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
715 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
716
717 brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
718 if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
719 brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
720 } else {
721 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
722 }
723 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
724 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
725
726 brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
727 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
728 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
729 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
730
731 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
732 src0.file == BRW_IMMEDIATE_VALUE ||
733 (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
734 src0.type == BRW_REGISTER_TYPE_NF));
735 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
736 src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
737 assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
738 src2.file == BRW_IMMEDIATE_VALUE);
739
740 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
741 src0.file == BRW_GENERAL_REGISTER_FILE ?
742 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
743 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
744 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
745 src1.file == BRW_GENERAL_REGISTER_FILE ?
746 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
747 BRW_ALIGN1_3SRC_ACCUMULATOR);
748 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
749 src2.file == BRW_GENERAL_REGISTER_FILE ?
750 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
751 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
752 } else {
753 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
754 dest.file == BRW_MESSAGE_REGISTER_FILE);
755 assert(dest.type == BRW_REGISTER_TYPE_F ||
756 dest.type == BRW_REGISTER_TYPE_DF ||
757 dest.type == BRW_REGISTER_TYPE_D ||
758 dest.type == BRW_REGISTER_TYPE_UD);
759 if (devinfo->gen == 6) {
760 brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
761 dest.file == BRW_MESSAGE_REGISTER_FILE);
762 }
763 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
764 brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
765 brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
766
767 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
768 brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
769 brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
770 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
771 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
772 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
773 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
774 src0.vstride == BRW_VERTICAL_STRIDE_0);
775
776 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
777 brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
778 brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
779 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
780 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
781 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
782 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
783 src1.vstride == BRW_VERTICAL_STRIDE_0);
784
785 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
786 brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
787 brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
788 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
789 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
790 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
791 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
792 src2.vstride == BRW_VERTICAL_STRIDE_0);
793
794 if (devinfo->gen >= 7) {
795 /* Set both the source and destination types based on dest.type,
796 * ignoring the source register types. The MAD and LRP emitters ensure
797 * that all four types are float. The BFE and BFI2 emitters, however,
798 * may send us mixed D and UD types and want us to ignore that and use
799 * the destination type.
800 */
801 brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
802 brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
803 }
804 }
805
806 return inst;
807 }
808
809
810 /***********************************************************************
811 * Convenience routines.
812 */
813 #define ALU1(OP) \
814 brw_inst *brw_##OP(struct brw_codegen *p, \
815 struct brw_reg dest, \
816 struct brw_reg src0) \
817 { \
818 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
819 }
820
821 #define ALU2(OP) \
822 brw_inst *brw_##OP(struct brw_codegen *p, \
823 struct brw_reg dest, \
824 struct brw_reg src0, \
825 struct brw_reg src1) \
826 { \
827 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
828 }
829
830 #define ALU3(OP) \
831 brw_inst *brw_##OP(struct brw_codegen *p, \
832 struct brw_reg dest, \
833 struct brw_reg src0, \
834 struct brw_reg src1, \
835 struct brw_reg src2) \
836 { \
837 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
838 }
839
840 #define ALU3F(OP) \
841 brw_inst *brw_##OP(struct brw_codegen *p, \
842 struct brw_reg dest, \
843 struct brw_reg src0, \
844 struct brw_reg src1, \
845 struct brw_reg src2) \
846 { \
847 assert(dest.type == BRW_REGISTER_TYPE_F || \
848 dest.type == BRW_REGISTER_TYPE_DF); \
849 if (dest.type == BRW_REGISTER_TYPE_F) { \
850 assert(src0.type == BRW_REGISTER_TYPE_F); \
851 assert(src1.type == BRW_REGISTER_TYPE_F); \
852 assert(src2.type == BRW_REGISTER_TYPE_F); \
853 } else if (dest.type == BRW_REGISTER_TYPE_DF) { \
854 assert(src0.type == BRW_REGISTER_TYPE_DF); \
855 assert(src1.type == BRW_REGISTER_TYPE_DF); \
856 assert(src2.type == BRW_REGISTER_TYPE_DF); \
857 } \
858 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
859 }
860
861 /* Rounding operations (other than RNDD) require two instructions - the first
862 * stores a rounded value (possibly the wrong way) in the dest register, but
863 * also sets a per-channel "increment bit" in the flag register. A predicated
864 * add of 1.0 fixes dest to contain the desired result.
865 *
866 * Sandybridge and later appear to round correctly without an ADD.
867 */
868 #define ROUND(OP) \
869 void brw_##OP(struct brw_codegen *p, \
870 struct brw_reg dest, \
871 struct brw_reg src) \
872 { \
873 const struct gen_device_info *devinfo = p->devinfo; \
874 brw_inst *rnd, *add; \
875 rnd = next_insn(p, BRW_OPCODE_##OP); \
876 brw_set_dest(p, rnd, dest); \
877 brw_set_src0(p, rnd, src); \
878 \
879 if (devinfo->gen < 6) { \
880 /* turn on round-increments */ \
881 brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \
882 add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \
883 brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
884 } \
885 }
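/* Sketch of what the macro expands to on Gen4/5 (register numbers are
 * illustrative): the RNDZ/RNDE sets the per-channel round-increment
 * flag, and the predicated ADD corrects the channels that were rounded
 * the wrong way.
 *
 *    rndz.r (8)  g4<1>F  g2<8,8,1>F
 *    (+f0) add (8)  g4<1>F  g4<8,8,1>F  1.0F
 *
 * On Gen6+ only the first instruction is emitted.
 */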
886
887
888 ALU2(SEL)
889 ALU1(NOT)
890 ALU2(AND)
891 ALU2(OR)
892 ALU2(XOR)
893 ALU2(SHR)
894 ALU2(SHL)
895 ALU1(DIM)
896 ALU2(ASR)
897 ALU3(CSEL)
898 ALU1(FRC)
899 ALU1(RNDD)
900 ALU2(MAC)
901 ALU2(MACH)
902 ALU1(LZD)
903 ALU2(DP4)
904 ALU2(DPH)
905 ALU2(DP3)
906 ALU2(DP2)
907 ALU3(MAD)
908 ALU3F(LRP)
909 ALU1(BFREV)
910 ALU3(BFE)
911 ALU2(BFI1)
912 ALU3(BFI2)
913 ALU1(FBH)
914 ALU1(FBL)
915 ALU1(CBIT)
916 ALU2(ADDC)
917 ALU2(SUBB)
918
919 ROUND(RNDZ)
920 ROUND(RNDE)
921
922 brw_inst *
923 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
924 {
925 const struct gen_device_info *devinfo = p->devinfo;
926
927 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
928 * To avoid the problems that causes, we use a <1,2,0> source region to read
929 * each element twice.
930 */
931 if (devinfo->gen == 7 && !devinfo->is_haswell &&
932 brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
933 dest.type == BRW_REGISTER_TYPE_DF &&
934 (src0.type == BRW_REGISTER_TYPE_F ||
935 src0.type == BRW_REGISTER_TYPE_D ||
936 src0.type == BRW_REGISTER_TYPE_UD) &&
937 !has_scalar_region(src0)) {
938 assert(src0.vstride == BRW_VERTICAL_STRIDE_4 &&
939 src0.width == BRW_WIDTH_4 &&
940 src0.hstride == BRW_HORIZONTAL_STRIDE_1);
941
942 src0.vstride = BRW_VERTICAL_STRIDE_1;
943 src0.width = BRW_WIDTH_2;
944 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
945 }
946
947 return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
948 }
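/* Example of the IVB/BYT F->DF region fixup above. A 4-wide <4,4,1>F
 * source reads f0 f1 f2 f3; the rewritten <1,2,0> region reads
 * f0 f0 f1 f1 f2 f2 f3 f3 across the two SIMD8 halves, so the odd
 * channels the hardware ignores still carry valid data.
 */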
949
950 brw_inst *
951 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
952 struct brw_reg src0, struct brw_reg src1)
953 {
954 /* 6.2.2: add */
955 if (src0.type == BRW_REGISTER_TYPE_F ||
956 (src0.file == BRW_IMMEDIATE_VALUE &&
957 src0.type == BRW_REGISTER_TYPE_VF)) {
958 assert(src1.type != BRW_REGISTER_TYPE_UD);
959 assert(src1.type != BRW_REGISTER_TYPE_D);
960 }
961
962 if (src1.type == BRW_REGISTER_TYPE_F ||
963 (src1.file == BRW_IMMEDIATE_VALUE &&
964 src1.type == BRW_REGISTER_TYPE_VF)) {
965 assert(src0.type != BRW_REGISTER_TYPE_UD);
966 assert(src0.type != BRW_REGISTER_TYPE_D);
967 }
968
969 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
970 }
971
972 brw_inst *
973 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
974 struct brw_reg src0, struct brw_reg src1)
975 {
976 assert(dest.type == src0.type);
977 assert(src0.type == src1.type);
978 switch (src0.type) {
979 case BRW_REGISTER_TYPE_B:
980 case BRW_REGISTER_TYPE_UB:
981 case BRW_REGISTER_TYPE_W:
982 case BRW_REGISTER_TYPE_UW:
983 case BRW_REGISTER_TYPE_D:
984 case BRW_REGISTER_TYPE_UD:
985 break;
986 default:
987 unreachable("Bad type for brw_AVG");
988 }
989
990 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
991 }
992
993 brw_inst *
994 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
995 struct brw_reg src0, struct brw_reg src1)
996 {
997 /* 6.32.38: mul */
998 if (src0.type == BRW_REGISTER_TYPE_D ||
999 src0.type == BRW_REGISTER_TYPE_UD ||
1000 src1.type == BRW_REGISTER_TYPE_D ||
1001 src1.type == BRW_REGISTER_TYPE_UD) {
1002 assert(dest.type != BRW_REGISTER_TYPE_F);
1003 }
1004
1005 if (src0.type == BRW_REGISTER_TYPE_F ||
1006 (src0.file == BRW_IMMEDIATE_VALUE &&
1007 src0.type == BRW_REGISTER_TYPE_VF)) {
1008 assert(src1.type != BRW_REGISTER_TYPE_UD);
1009 assert(src1.type != BRW_REGISTER_TYPE_D);
1010 }
1011
1012 if (src1.type == BRW_REGISTER_TYPE_F ||
1013 (src1.file == BRW_IMMEDIATE_VALUE &&
1014 src1.type == BRW_REGISTER_TYPE_VF)) {
1015 assert(src0.type != BRW_REGISTER_TYPE_UD);
1016 assert(src0.type != BRW_REGISTER_TYPE_D);
1017 }
1018
1019 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1020 src0.nr != BRW_ARF_ACCUMULATOR);
1021 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1022 src1.nr != BRW_ARF_ACCUMULATOR);
1023
1024 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1025 }
1026
1027 brw_inst *
1028 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1029 struct brw_reg src0, struct brw_reg src1)
1030 {
1031 src0.vstride = BRW_VERTICAL_STRIDE_0;
1032 src0.width = BRW_WIDTH_1;
1033 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1034 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1035 }
1036
1037 brw_inst *
1038 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1039 struct brw_reg src0, struct brw_reg src1)
1040 {
1041 src0.vstride = BRW_VERTICAL_STRIDE_0;
1042 src0.width = BRW_WIDTH_1;
1043 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1044 src1.vstride = BRW_VERTICAL_STRIDE_8;
1045 src1.width = BRW_WIDTH_8;
1046 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1047 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1048 }
1049
1050 brw_inst *
1051 brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1052 {
1053 const struct gen_device_info *devinfo = p->devinfo;
1054 const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1055 /* The F32TO16 instruction doesn't support 32-bit destination types in
1056 * Align1 mode, and neither does the Gen8 implementation in terms of a
1057 * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
1058 * an undocumented feature.
1059 */
1060 const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
1061 (!align16 || devinfo->gen >= 8));
1062 brw_inst *inst;
1063
1064 if (align16) {
1065 assert(dst.type == BRW_REGISTER_TYPE_UD);
1066 } else {
1067 assert(dst.type == BRW_REGISTER_TYPE_UD ||
1068 dst.type == BRW_REGISTER_TYPE_W ||
1069 dst.type == BRW_REGISTER_TYPE_UW ||
1070 dst.type == BRW_REGISTER_TYPE_HF);
1071 }
1072
1073 brw_push_insn_state(p);
1074
1075 if (needs_zero_fill) {
1076 brw_set_default_access_mode(p, BRW_ALIGN_1);
1077 dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
1078 }
1079
1080 if (devinfo->gen >= 8) {
1081 inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
1082 } else {
1083 assert(devinfo->gen == 7);
1084 inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
1085 }
1086
1087 if (needs_zero_fill) {
1088 brw_inst_set_no_dd_clear(devinfo, inst, true);
1089 inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
1090 brw_inst_set_no_dd_check(devinfo, inst, true);
1091 }
1092
1093 brw_pop_insn_state(p);
1094 return inst;
1095 }
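/* Illustrative shape of the zero-fill path above (register numbers are
 * made up): the HF result lands in the low word of each dword with
 * NoDDClr, then the high words are cleared with NoDDChk so no
 * destination dependency is enforced between the pair.
 *
 *    mov (8) g4<2>HF   g2<8,8,1>F   { NoDDClr }
 *    mov (8) g4.1<2>W  0W           { NoDDChk }
 */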
1096
1097 brw_inst *
1098 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1099 {
1100 const struct gen_device_info *devinfo = p->devinfo;
1101 bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1102
1103 if (align16) {
1104 assert(src.type == BRW_REGISTER_TYPE_UD);
1105 } else {
1106 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1107 *
1108 * Because this instruction does not have a 16-bit floating-point
1109 * type, the source data type must be Word (W). The destination type
1110 * must be F (Float).
1111 */
1112 if (src.type == BRW_REGISTER_TYPE_UD)
1113 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1114
1115 assert(src.type == BRW_REGISTER_TYPE_W ||
1116 src.type == BRW_REGISTER_TYPE_UW ||
1117 src.type == BRW_REGISTER_TYPE_HF);
1118 }
1119
1120 if (devinfo->gen >= 8) {
1121 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1122 } else {
1123 assert(devinfo->gen == 7);
1124 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1125 }
1126 }
1127
1128
1129 void brw_NOP(struct brw_codegen *p)
1130 {
1131 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1132 memset(insn, 0, sizeof(*insn));
1133 brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1134 }
1135
1136
1137
1138
1139
1140 /***********************************************************************
1141 * Comparisons, if/else/endif
1142 */
1143
1144 brw_inst *
1145 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1146 unsigned predicate_control)
1147 {
1148 const struct gen_device_info *devinfo = p->devinfo;
1149 struct brw_reg ip = brw_ip_reg();
1150 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1151
1152 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1153 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1154 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1155 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1156
1157 return inst;
1158 }
1159
1160 static void
1161 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1162 {
1163 p->if_stack[p->if_stack_depth] = inst - p->store;
1164
1165 p->if_stack_depth++;
1166 if (p->if_stack_array_size <= p->if_stack_depth) {
1167 p->if_stack_array_size *= 2;
1168 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1169 p->if_stack_array_size);
1170 }
1171 }
1172
1173 static brw_inst *
1174 pop_if_stack(struct brw_codegen *p)
1175 {
1176 p->if_stack_depth--;
1177 return &p->store[p->if_stack[p->if_stack_depth]];
1178 }
1179
1180 static void
1181 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1182 {
1183 if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1184 p->loop_stack_array_size *= 2;
1185 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1186 p->loop_stack_array_size);
1187 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1188 p->loop_stack_array_size);
1189 }
1190
1191 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1192 p->loop_stack_depth++;
1193 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1194 }
1195
1196 static brw_inst *
1197 get_inner_do_insn(struct brw_codegen *p)
1198 {
1199 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1200 }
1201
1202 /* EU takes the value from the flag register and pushes it onto some
1203 * sort of a stack (presumably merging with any flag value already on
1204 * the stack). Within an if block, the flags at the top of the stack
1205 * control execution on each channel of the unit, e.g. on each of the
1206 * 16 pixel values in our wm programs.
1207 *
1208 * When the matching 'else' instruction is reached (presumably by
1209 * countdown of the instruction count patched in by our ELSE/ENDIF
1210 * functions), the relevant flags are inverted.
1211 *
1212 * When the matching 'endif' instruction is reached, the flags are
1213 * popped off. If the stack is now empty, normal execution resumes.
1214 */
1215 brw_inst *
1216 brw_IF(struct brw_codegen *p, unsigned execute_size)
1217 {
1218 const struct gen_device_info *devinfo = p->devinfo;
1219 brw_inst *insn;
1220
1221 insn = next_insn(p, BRW_OPCODE_IF);
1222
1223 /* Override the defaults for this instruction:
1224 */
1225 if (devinfo->gen < 6) {
1226 brw_set_dest(p, insn, brw_ip_reg());
1227 brw_set_src0(p, insn, brw_ip_reg());
1228 brw_set_src1(p, insn, brw_imm_d(0x0));
1229 } else if (devinfo->gen == 6) {
1230 brw_set_dest(p, insn, brw_imm_w(0));
1231 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1232 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1233 brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1234 } else if (devinfo->gen == 7) {
1235 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1236 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1237 brw_set_src1(p, insn, brw_imm_w(0));
1238 brw_inst_set_jip(devinfo, insn, 0);
1239 brw_inst_set_uip(devinfo, insn, 0);
1240 } else {
1241 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1242 brw_set_src0(p, insn, brw_imm_d(0));
1243 brw_inst_set_jip(devinfo, insn, 0);
1244 brw_inst_set_uip(devinfo, insn, 0);
1245 }
1246
1247 brw_inst_set_exec_size(devinfo, insn, execute_size);
1248 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1249 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
1250 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1251 if (!p->single_program_flow && devinfo->gen < 6)
1252 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1253
1254 push_if_stack(p, insn);
1255 p->if_depth_in_loop[p->loop_stack_depth]++;
1256 return insn;
1257 }
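/* Typical structured control-flow emission (a sketch; the bodies are up
 * to the caller): brw_IF() pushes onto p->if_stack here, and the jump
 * targets are patched when the matching brw_ENDIF() is emitted.
 *
 *    brw_IF(p, BRW_EXECUTE_8);
 *    ...emit "then" instructions...
 *    brw_ELSE(p);
 *    ...emit "else" instructions...
 *    brw_ENDIF(p);
 */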
1258
1259 /* This function is only used for gen6-style IF instructions with an
1260 * embedded comparison (conditional modifier). It is not used on gen7.
1261 */
1262 brw_inst *
1263 gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
1264 struct brw_reg src0, struct brw_reg src1)
1265 {
1266 const struct gen_device_info *devinfo = p->devinfo;
1267 brw_inst *insn;
1268
1269 insn = next_insn(p, BRW_OPCODE_IF);
1270
1271 brw_set_dest(p, insn, brw_imm_w(0));
1272 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1273 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1274 brw_set_src0(p, insn, src0);
1275 brw_set_src1(p, insn, src1);
1276
1277 assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
1278 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
1279 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1280
1281 push_if_stack(p, insn);
1282 return insn;
1283 }
1284
1285 /**
1286 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1287 */
1288 static void
1289 convert_IF_ELSE_to_ADD(struct brw_codegen *p,
1290 brw_inst *if_inst, brw_inst *else_inst)
1291 {
1292 const struct gen_device_info *devinfo = p->devinfo;
1293
1294 /* The next instruction (where the ENDIF would be, if it existed) */
1295 brw_inst *next_inst = &p->store[p->nr_insn];
1296
1297 assert(p->single_program_flow);
1298 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1299 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1300 assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);
1301
1302 /* Convert IF to an ADD instruction that moves the instruction pointer
1303 * to the first instruction of the ELSE block. If there is no ELSE
1304 * block, point to where ENDIF would be. Reverse the predicate.
1305 *
1306 * There's no need to execute an ENDIF since we don't need to do any
1307 * stack operations, and if we're currently executing, we just want to
1308 * continue normally.
1309 */
1310 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
1311 brw_inst_set_pred_inv(devinfo, if_inst, true);
1312
1313 if (else_inst != NULL) {
1314 /* Convert ELSE to an ADD instruction that points where the ENDIF
1315 * would be.
1316 */
1317 brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);
1318
1319 brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
1320 brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
1321 } else {
1322 brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
1323 }
1324 }
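/* The immediates above are byte offsets into the instruction stream:
 * each native Gen instruction is 16 bytes, so skipping N instructions
 * adds N * 16 to IP. With an IF in slot 10, an ELSE in slot 14 and the
 * would-be ENDIF in slot 18 (slot numbers made up), the IF becomes
 * "add ip, ip, 80" ((14 - 10 + 1) * 16) and the ELSE becomes
 * "add ip, ip, 64" ((18 - 14) * 16).
 */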
1325
1326 /**
1327 * Patch IF and ELSE instructions with appropriate jump targets.
1328 */
1329 static void
1330 patch_IF_ELSE(struct brw_codegen *p,
1331 brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
1332 {
1333 const struct gen_device_info *devinfo = p->devinfo;
1334
1335 /* We shouldn't be patching IF and ELSE instructions in single program flow
1336 * mode when gen < 6, because in single program flow mode on those
1337 * platforms, we convert flow control instructions to conditional ADDs that
1338 * operate on IP (see brw_ENDIF).
1339 *
1340 * However, on Gen6, writing to IP doesn't work in single program flow mode
1341 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1342 * not be updated by non-flow control instructions."). And on later
1343 * platforms, there is no significant benefit to converting control flow
1344 * instructions to conditional ADDs. So we do patch IF and ELSE
1345 * instructions in single program flow mode on those platforms.
1346 */
1347 if (devinfo->gen < 6)
1348 assert(!p->single_program_flow);
1349
1350 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1351 assert(endif_inst != NULL);
1352 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1353
1354 unsigned br = brw_jump_scale(devinfo);
1355
1356 assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
1357 brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
1358
1359 if (else_inst == NULL) {
1360 /* Patch IF -> ENDIF */
1361 if (devinfo->gen < 6) {
1362 /* Turn it into an IFF, which means no mask stack operations are
1363 * needed in the all-false case, and we jump past the ENDIF.
1364 */
1365 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
1366 brw_inst_set_gen4_jump_count(devinfo, if_inst,
1367 br * (endif_inst - if_inst + 1));
1368 brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1369 } else if (devinfo->gen == 6) {
1370 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1371 brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1372 } else {
1373 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1374 brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1375 }
1376 } else {
1377 brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
1378
1379 /* Patch IF -> ELSE */
1380 if (devinfo->gen < 6) {
1381 brw_inst_set_gen4_jump_count(devinfo, if_inst,
1382 br * (else_inst - if_inst));
1383 brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1384 } else if (devinfo->gen == 6) {
1385 brw_inst_set_gen6_jump_count(devinfo, if_inst,
1386 br * (else_inst - if_inst + 1));
1387 }
1388
1389 /* Patch ELSE -> ENDIF */
1390 if (devinfo->gen < 6) {
1391 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1392 * matching ENDIF.
1393 */
1394 brw_inst_set_gen4_jump_count(devinfo, else_inst,
1395 br * (endif_inst - else_inst + 1));
1396 brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
1397 } else if (devinfo->gen == 6) {
1398 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1399 brw_inst_set_gen6_jump_count(devinfo, else_inst,
1400 br * (endif_inst - else_inst));
1401 } else {
1402 /* The IF instruction's JIP should point just past the ELSE */
1403 brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1404 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1405 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1406 brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1407 if (devinfo->gen >= 8) {
1408 /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1409 * should point to ENDIF.
1410 */
1411 brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1412 }
1413 }
1414 }
1415 }
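/* The scale factor br above converts instruction counts into the units
 * the jump fields use: whole 128-bit instructions on Gen4, 64-bit
 * chunks on Gen5-7 (so compacted instructions can be counted), and
 * bytes on Gen8+. E.g. on Gen7, skipping 3 instructions stores
 * 2 * 3 == 6 in JIP.
 */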
1416
1417 void
1418 brw_ELSE(struct brw_codegen *p)
1419 {
1420 const struct gen_device_info *devinfo = p->devinfo;
1421 brw_inst *insn;
1422
1423 insn = next_insn(p, BRW_OPCODE_ELSE);
1424
1425 if (devinfo->gen < 6) {
1426 brw_set_dest(p, insn, brw_ip_reg());
1427 brw_set_src0(p, insn, brw_ip_reg());
1428 brw_set_src1(p, insn, brw_imm_d(0x0));
1429 } else if (devinfo->gen == 6) {
1430 brw_set_dest(p, insn, brw_imm_w(0));
1431 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1432 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1433 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1434 } else if (devinfo->gen == 7) {
1435 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1436 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1437 brw_set_src1(p, insn, brw_imm_w(0));
1438 brw_inst_set_jip(devinfo, insn, 0);
1439 brw_inst_set_uip(devinfo, insn, 0);
1440 } else {
1441 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1442 brw_set_src0(p, insn, brw_imm_d(0));
1443 brw_inst_set_jip(devinfo, insn, 0);
1444 brw_inst_set_uip(devinfo, insn, 0);
1445 }
1446
1447 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1448 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1449 if (!p->single_program_flow && devinfo->gen < 6)
1450 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1451
1452 push_if_stack(p, insn);
1453 }
1454
1455 void
1456 brw_ENDIF(struct brw_codegen *p)
1457 {
1458 const struct gen_device_info *devinfo = p->devinfo;
1459 brw_inst *insn = NULL;
1460 brw_inst *else_inst = NULL;
1461 brw_inst *if_inst = NULL;
1462 brw_inst *tmp;
1463 bool emit_endif = true;
1464
1465 /* In single program flow mode, we can express IF and ELSE instructions
1466 * equivalently as ADD instructions that operate on IP. On platforms prior
1467 * to Gen6, flow control instructions cause an implied thread switch, so
1468 * this is a significant savings.
1469 *
1470 * However, on Gen6, writing to IP doesn't work in single program flow mode
1471 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1472 * not be updated by non-flow control instructions."). And on later
1473 * platforms, there is no significant benefit to converting control flow
1474 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1475 * Gen5.
1476 */
1477 if (devinfo->gen < 6 && p->single_program_flow)
1478 emit_endif = false;
1479
1480 /*
1481 * A single next_insn() may change the base address of the instruction
1482 * store memory (p->store), so call it first, before referencing the
1483 * instruction store pointer through an index.
1484 */
1485 if (emit_endif)
1486 insn = next_insn(p, BRW_OPCODE_ENDIF);
1487
1488 /* Pop the IF and (optional) ELSE instructions from the stack */
1489 p->if_depth_in_loop[p->loop_stack_depth]--;
1490 tmp = pop_if_stack(p);
1491 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1492 else_inst = tmp;
1493 tmp = pop_if_stack(p);
1494 }
1495 if_inst = tmp;
1496
1497 if (!emit_endif) {
1498 /* ENDIF is useless; don't bother emitting it. */
1499 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1500 return;
1501 }
1502
1503 if (devinfo->gen < 6) {
1504 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1505 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1506 brw_set_src1(p, insn, brw_imm_d(0x0));
1507 } else if (devinfo->gen == 6) {
1508 brw_set_dest(p, insn, brw_imm_w(0));
1509 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1510 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1511 } else if (devinfo->gen == 7) {
1512 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1513 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1514 brw_set_src1(p, insn, brw_imm_w(0));
1515 } else {
1516 brw_set_src0(p, insn, brw_imm_d(0));
1517 }
1518
1519 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1520 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1521 if (devinfo->gen < 6)
1522 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1523
1524 /* Also pop item off the stack in the endif instruction: */
1525 if (devinfo->gen < 6) {
1526 brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1527 brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1528 } else if (devinfo->gen == 6) {
1529 brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1530 } else {
1531 brw_inst_set_jip(devinfo, insn, 2);
1532 }
1533 patch_IF_ELSE(p, if_inst, else_inst, insn);
1534 }
1535
1536 brw_inst *
1537 brw_BREAK(struct brw_codegen *p)
1538 {
1539 const struct gen_device_info *devinfo = p->devinfo;
1540 brw_inst *insn;
1541
1542 insn = next_insn(p, BRW_OPCODE_BREAK);
1543 if (devinfo->gen >= 8) {
1544 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1545 brw_set_src0(p, insn, brw_imm_d(0x0));
1546 } else if (devinfo->gen >= 6) {
1547 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1548 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1549 brw_set_src1(p, insn, brw_imm_d(0x0));
1550 } else {
1551 brw_set_dest(p, insn, brw_ip_reg());
1552 brw_set_src0(p, insn, brw_ip_reg());
1553 brw_set_src1(p, insn, brw_imm_d(0x0));
1554 brw_inst_set_gen4_pop_count(devinfo, insn,
1555 p->if_depth_in_loop[p->loop_stack_depth]);
1556 }
1557 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1558 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1559
1560 return insn;
1561 }
1562
1563 brw_inst *
1564 brw_CONT(struct brw_codegen *p)
1565 {
1566 const struct gen_device_info *devinfo = p->devinfo;
1567 brw_inst *insn;
1568
1569 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1570 brw_set_dest(p, insn, brw_ip_reg());
1571 if (devinfo->gen >= 8) {
1572 brw_set_src0(p, insn, brw_imm_d(0x0));
1573 } else {
1574 brw_set_src0(p, insn, brw_ip_reg());
1575 brw_set_src1(p, insn, brw_imm_d(0x0));
1576 }
1577
1578 if (devinfo->gen < 6) {
1579 brw_inst_set_gen4_pop_count(devinfo, insn,
1580 p->if_depth_in_loop[p->loop_stack_depth]);
1581 }
1582 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1583 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1584 return insn;
1585 }
1586
1587 brw_inst *
1588 gen6_HALT(struct brw_codegen *p)
1589 {
1590 const struct gen_device_info *devinfo = p->devinfo;
1591 brw_inst *insn;
1592
1593 insn = next_insn(p, BRW_OPCODE_HALT);
1594 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1595 if (devinfo->gen >= 8) {
1596 brw_set_src0(p, insn, brw_imm_d(0x0));
1597 } else {
1598 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1599 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1600 }
1601
1602 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1603 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1604 return insn;
1605 }
1606
1607 /* DO/WHILE loop:
1608 *
1609 * The DO/WHILE is just an unterminated loop -- break or continue are
1610  * used for control within the loop. There are a few ways this can
1611  * be done.
1612  *
1613  * For uniform control flow, the WHILE is just a jump: we emit ADD ip, ip,
1614  * jip and no DO instruction.
1615  *
1616  * For non-uniform control flow pre-gen6, there's a DO instruction to
1617  * push the mask, a WHILE to jump back, and a BREAK to get out and
1618 * pop the mask.
1619 *
1620 * For gen6, there's no more mask stack, so no need for DO. WHILE
1621 * just points back to the first instruction of the loop.
1622 */
1623 brw_inst *
1624 brw_DO(struct brw_codegen *p, unsigned execute_size)
1625 {
1626 const struct gen_device_info *devinfo = p->devinfo;
1627
1628 if (devinfo->gen >= 6 || p->single_program_flow) {
1629 push_loop_stack(p, &p->store[p->nr_insn]);
1630 return &p->store[p->nr_insn];
1631 } else {
1632 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1633
1634 push_loop_stack(p, insn);
1635
1636 /* Override the defaults for this instruction:
1637 */
1638 brw_set_dest(p, insn, brw_null_reg());
1639 brw_set_src0(p, insn, brw_null_reg());
1640 brw_set_src1(p, insn, brw_null_reg());
1641
1642 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1643 brw_inst_set_exec_size(devinfo, insn, execute_size);
1644 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1645
1646 return insn;
1647 }
1648 }
1649
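/* A minimal usage sketch (illustrative only, not code emitted by this file),
 * assuming "p" is an initialized brw_codegen and the loop body computes an
 * exit condition into f0.0:
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *       ...loop body...
 *       brw_inst *brk = brw_BREAK(p);
 *       brw_inst_set_pred_control(p->devinfo, brk, BRW_PREDICATE_NORMAL);
 *    brw_WHILE(p);
 *
 * On gen6+ the BREAK's JIP/UIP are filled in afterwards by brw_set_uip_jip();
 * pre-gen6 they are patched by brw_patch_break_cont() when the WHILE is
 * emitted.
 */
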
1650 /**
1651 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1652 * instruction here.
1653 *
1654 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1655 * nesting, since it can always just point to the end of the block/current loop.
1656 */
1657 static void
1658 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1659 {
1660 const struct gen_device_info *devinfo = p->devinfo;
1661 brw_inst *do_inst = get_inner_do_insn(p);
1662 brw_inst *inst;
1663 unsigned br = brw_jump_scale(devinfo);
1664
1665 assert(devinfo->gen < 6);
1666
1667 for (inst = while_inst - 1; inst != do_inst; inst--) {
1668       /* If the jump count is nonzero, this instruction has already been
1669        * patched because it's part of a loop inside the one we're
1670        * patching.
1671 */
1672 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1673 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1674 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1675 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1676 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1677 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1678 }
1679 }
1680 }
1681
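/* Worked example (illustrative): with br == 2 and a BREAK sitting three
 * instructions before the WHILE, the BREAK gets a jump count of
 * 2 * (3 + 1) = 8 (one instruction past the WHILE, so the mask is popped),
 * while a CONTINUE in the same spot gets 2 * 3 = 6 (the WHILE itself).
 */
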
1682 brw_inst *
1683 brw_WHILE(struct brw_codegen *p)
1684 {
1685 const struct gen_device_info *devinfo = p->devinfo;
1686 brw_inst *insn, *do_insn;
1687 unsigned br = brw_jump_scale(devinfo);
1688
1689 if (devinfo->gen >= 6) {
1690 insn = next_insn(p, BRW_OPCODE_WHILE);
1691 do_insn = get_inner_do_insn(p);
1692
1693 if (devinfo->gen >= 8) {
1694 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1695 brw_set_src0(p, insn, brw_imm_d(0));
1696 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1697 } else if (devinfo->gen == 7) {
1698 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1699 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1700 brw_set_src1(p, insn, brw_imm_w(0));
1701 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1702 } else {
1703 brw_set_dest(p, insn, brw_imm_w(0));
1704 brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1705 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1706 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1707 }
1708
1709 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1710
1711 } else {
1712 if (p->single_program_flow) {
1713 insn = next_insn(p, BRW_OPCODE_ADD);
1714 do_insn = get_inner_do_insn(p);
1715
1716 brw_set_dest(p, insn, brw_ip_reg());
1717 brw_set_src0(p, insn, brw_ip_reg());
1718 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1719 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1720 } else {
1721 insn = next_insn(p, BRW_OPCODE_WHILE);
1722 do_insn = get_inner_do_insn(p);
1723
1724 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1725
1726 brw_set_dest(p, insn, brw_ip_reg());
1727 brw_set_src0(p, insn, brw_ip_reg());
1728 brw_set_src1(p, insn, brw_imm_d(0));
1729
1730 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1731 brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1732 brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1733
1734 brw_patch_break_cont(p, insn);
1735 }
1736 }
1737 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1738
1739 p->loop_stack_depth--;
1740
1741 return insn;
1742 }
1743
1744 /* FORWARD JUMPS:
1745 */
1746 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1747 {
1748 const struct gen_device_info *devinfo = p->devinfo;
1749 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1750 unsigned jmpi = 1;
1751
1752 if (devinfo->gen >= 5)
1753 jmpi = 2;
1754
1755 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1756 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1757
1758 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1759 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1760 }
1761
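/* Typical pattern (an assumed sketch, not from this file): record the index
 * of a JMPI emitted with a zero placeholder, emit the instructions to be
 * skipped, then patch the jump once the landing point is known:
 *
 *    int jmp = p->nr_insn;
 *    brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL);
 *    ...instructions to skip...
 *    brw_land_fwd_jump(p, jmp);
 */
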
1762 /* To integrate with the above, it makes sense that the comparison
1763 * instruction should populate the flag register. It might be simpler
1764 * just to use the flag reg for most WM tasks?
1765 */
1766 void brw_CMP(struct brw_codegen *p,
1767 struct brw_reg dest,
1768 unsigned conditional,
1769 struct brw_reg src0,
1770 struct brw_reg src1)
1771 {
1772 const struct gen_device_info *devinfo = p->devinfo;
1773 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1774
1775 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1776 brw_set_dest(p, insn, dest);
1777 brw_set_src0(p, insn, src0);
1778 brw_set_src1(p, insn, src1);
1779
1780 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1781 * page says:
1782 * "Any CMP instruction with a null destination must use a {switch}."
1783 *
1784 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1785 * mentioned on their work-arounds pages.
1786 */
1787 if (devinfo->gen == 7) {
1788 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1789 dest.nr == BRW_ARF_NULL) {
1790 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1791 }
1792 }
1793 }
1794
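/* Illustrative use ("value" is an assumed operand): compare against zero and
 * write only the flag register by using a null destination; on Gen7 the
 * function above adds the required {switch} automatically:
 *
 *    brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_D),
 *            BRW_CONDITIONAL_NZ, value, brw_imm_d(0));
 */
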
1795 /***********************************************************************
1796 * Helpers for the various SEND message types:
1797 */
1798
1799 /** Extended math function, float[8].
1800 */
1801 void gen4_math(struct brw_codegen *p,
1802 struct brw_reg dest,
1803 unsigned function,
1804 unsigned msg_reg_nr,
1805 struct brw_reg src,
1806 unsigned precision )
1807 {
1808 const struct gen_device_info *devinfo = p->devinfo;
1809 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1810 unsigned data_type;
1811 if (has_scalar_region(src)) {
1812 data_type = BRW_MATH_DATA_SCALAR;
1813 } else {
1814 data_type = BRW_MATH_DATA_VECTOR;
1815 }
1816
1817 assert(devinfo->gen < 6);
1818
1819 /* Example code doesn't set predicate_control for send
1820 * instructions.
1821 */
1822 brw_inst_set_pred_control(devinfo, insn, 0);
1823 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1824
1825 brw_set_dest(p, insn, dest);
1826 brw_set_src0(p, insn, src);
1827 brw_set_math_message(p,
1828 insn,
1829 function,
1830 src.type == BRW_REGISTER_TYPE_D,
1831 precision,
1832 data_type);
1833 }
1834
1835 void gen6_math(struct brw_codegen *p,
1836 struct brw_reg dest,
1837 unsigned function,
1838 struct brw_reg src0,
1839 struct brw_reg src1)
1840 {
1841 const struct gen_device_info *devinfo = p->devinfo;
1842 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1843
1844 assert(devinfo->gen >= 6);
1845
1846 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1847 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1848
1849 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1850 if (devinfo->gen == 6) {
1851 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1852 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1853 }
1854
1855 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1856 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1857 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1858 assert(src0.type != BRW_REGISTER_TYPE_F);
1859 assert(src1.type != BRW_REGISTER_TYPE_F);
1860 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
1861 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
1862 } else {
1863 assert(src0.type == BRW_REGISTER_TYPE_F);
1864 assert(src1.type == BRW_REGISTER_TYPE_F);
1865 }
1866
1867 /* Source modifiers are ignored for extended math instructions on Gen6. */
1868 if (devinfo->gen == 6) {
1869 assert(!src0.negate);
1870 assert(!src0.abs);
1871 assert(!src1.negate);
1872 assert(!src1.abs);
1873 }
1874
1875 brw_inst_set_math_function(devinfo, insn, function);
1876
1877 brw_set_dest(p, insn, dest);
1878 brw_set_src0(p, insn, src0);
1879 brw_set_src1(p, insn, src1);
1880 }
1881
1882 /**
1883 * Return the right surface index to access the thread scratch space using
1884 * stateless dataport messages.
1885 */
1886 unsigned
1887 brw_scratch_surface_idx(const struct brw_codegen *p)
1888 {
1889 /* The scratch space is thread-local so IA coherency is unnecessary. */
1890 if (p->devinfo->gen >= 8)
1891 return GEN8_BTI_STATELESS_NON_COHERENT;
1892 else
1893 return BRW_BTI_STATELESS;
1894 }
1895
1896 /**
1897  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1898 * using a constant offset per channel.
1899 *
1900 * The offset must be aligned to oword size (16 bytes). Used for
1901 * register spilling.
1902 */
1903 void brw_oword_block_write_scratch(struct brw_codegen *p,
1904 struct brw_reg mrf,
1905 int num_regs,
1906 unsigned offset)
1907 {
1908 const struct gen_device_info *devinfo = p->devinfo;
1909 const unsigned target_cache =
1910 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
1911 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
1912 BRW_SFID_DATAPORT_WRITE);
1913 uint32_t msg_type;
1914
1915 if (devinfo->gen >= 6)
1916 offset /= 16;
1917
1918 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1919
1920 const unsigned mlen = 1 + num_regs;
1921
1922 /* Set up the message header. This is g0, with g0.2 filled with
1923 * the offset. We don't want to leave our offset around in g0 or
1924 * it'll screw up texture samples, so set it up inside the message
1925 * reg.
1926 */
1927 {
1928 brw_push_insn_state(p);
1929 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1930 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1931 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1932
1933 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1934
1935 /* set message header global offset field (reg 0, element 2) */
1936 brw_set_default_exec_size(p, BRW_EXECUTE_1);
1937 brw_MOV(p,
1938 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1939 mrf.nr,
1940 2), BRW_REGISTER_TYPE_UD),
1941 brw_imm_ud(offset));
1942
1943 brw_pop_insn_state(p);
1944 }
1945
1946 {
1947 struct brw_reg dest;
1948 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1949 int send_commit_msg;
1950 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1951 BRW_REGISTER_TYPE_UW);
1952
1953 brw_inst_set_sfid(devinfo, insn, target_cache);
1954 brw_inst_set_compression(devinfo, insn, false);
1955
1956 if (brw_inst_exec_size(devinfo, insn) >= 16)
1957 src_header = vec16(src_header);
1958
1959 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
1960 if (devinfo->gen < 6)
1961 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
1962
1963 /* Until gen6, writes followed by reads from the same location
1964 * are not guaranteed to be ordered unless write_commit is set.
1965 * If set, then a no-op write is issued to the destination
1966 * register to set a dependency, and a read from the destination
1967 * can be used to ensure the ordering.
1968 *
1969 * For gen6, only writes between different threads need ordering
1970 * protection. Our use of DP writes is all about register
1971 * spilling within a thread.
1972 */
1973 if (devinfo->gen >= 6) {
1974 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1975 send_commit_msg = 0;
1976 } else {
1977 dest = src_header;
1978 send_commit_msg = 1;
1979 }
1980
1981 brw_set_dest(p, insn, dest);
1982 if (devinfo->gen >= 6) {
1983 brw_set_src0(p, insn, mrf);
1984 } else {
1985 brw_set_src0(p, insn, brw_null_reg());
1986 }
1987
1988 if (devinfo->gen >= 6)
1989 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1990 else
1991 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1992
1993 brw_set_desc(p, insn,
1994 brw_message_desc(devinfo, mlen, send_commit_msg, true) |
1995 brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
1996 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
1997 msg_type, 0, /* not a render target */
1998 send_commit_msg));
1999 }
2000 }
2001
2002
2003 /**
2004  * Read a block of OWORDs (half a GRF each) from the scratch buffer,
2005  * using a constant offset per channel.
2006 *
2007 * Offset must be aligned to oword size (16 bytes). Used for register
2008 * spilling.
2009 */
2010 void
2011 brw_oword_block_read_scratch(struct brw_codegen *p,
2012 struct brw_reg dest,
2013 struct brw_reg mrf,
2014 int num_regs,
2015 unsigned offset)
2016 {
2017 const struct gen_device_info *devinfo = p->devinfo;
2018
2019 if (devinfo->gen >= 6)
2020 offset /= 16;
2021
2022 if (p->devinfo->gen >= 7) {
2023    /* On gen7 and above, we no longer have message registers and we can
2024 * send from any register we want. By using the destination register
2025 * for the message, we guarantee that the implied message write won't
2026 * accidentally overwrite anything. This has been a problem because
2027 * the MRF registers and source for the final FB write are both fixed
2028 * and may overlap.
2029 */
2030 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2031 } else {
2032 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2033 }
2034 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2035
2036 const unsigned rlen = num_regs;
2037 const unsigned target_cache =
2038 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2039 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2040 BRW_SFID_DATAPORT_READ);
2041
2042 {
2043 brw_push_insn_state(p);
2044 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2045 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2046 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2047
2048 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2049
2050 /* set message header global offset field (reg 0, element 2) */
2051 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2052 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2053
2054 brw_pop_insn_state(p);
2055 }
2056
2057 {
2058 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2059
2060 brw_inst_set_sfid(devinfo, insn, target_cache);
2061 assert(brw_inst_pred_control(devinfo, insn) == 0);
2062 brw_inst_set_compression(devinfo, insn, false);
2063
2064 brw_set_dest(p, insn, dest); /* UW? */
2065 if (devinfo->gen >= 6) {
2066 brw_set_src0(p, insn, mrf);
2067 } else {
2068 brw_set_src0(p, insn, brw_null_reg());
2069 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2070 }
2071
2072 brw_set_desc(p, insn,
2073 brw_message_desc(devinfo, 1, rlen, true) |
2074 brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
2075 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2076 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2077 BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2078 }
2079 }
2080
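/* Illustrative spill/fill pair (the MRF number and offset are assumptions):
 * spill two GRFs to scratch at byte offset 64, then read them back later
 * with the matching read:
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(10), 2, 64);
 *    ...
 *    brw_oword_block_read_scratch(p, dest, brw_message_reg(10), 2, 64);
 */
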
2081 void
2082 gen7_block_read_scratch(struct brw_codegen *p,
2083 struct brw_reg dest,
2084 int num_regs,
2085 unsigned offset)
2086 {
2087 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2088 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2089
2090 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2091
2092    /* The HW requires that the header be present; this is to get the g0.5
2093 * scratch offset.
2094 */
2095 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2096
2097 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2098 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2099 * is 32 bytes, which happens to be the size of a register.
2100 */
2101 offset /= REG_SIZE;
2102 assert(offset < (1 << 12));
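   /* For example (illustrative): a byte offset of 1024 becomes an HWord
    * offset of 1024 / 32 = 32, comfortably within the 12-bit limit.
    */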
2103
2104 gen7_set_dp_scratch_message(p, insn,
2105 false, /* scratch read */
2106 false, /* OWords */
2107 false, /* invalidate after read */
2108 num_regs,
2109 offset,
2110 1, /* mlen: just g0 */
2111 num_regs, /* rlen */
2112 true); /* header present */
2113 }
2114
2115 /**
2116 * Read float[4] vectors from the data port constant cache.
2117 * Location (in buffer) should be a multiple of 16.
2118 * Used for fetching shader constants.
2119 */
2120 void brw_oword_block_read(struct brw_codegen *p,
2121 struct brw_reg dest,
2122 struct brw_reg mrf,
2123 uint32_t offset,
2124 uint32_t bind_table_index)
2125 {
2126 const struct gen_device_info *devinfo = p->devinfo;
2127 const unsigned target_cache =
2128 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2129 BRW_SFID_DATAPORT_READ);
2130 const unsigned exec_size = 1 << brw_get_default_exec_size(p);
2131
2132 /* On newer hardware, offset is in units of owords. */
2133 if (devinfo->gen >= 6)
2134 offset /= 16;
2135
2136 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2137
2138 brw_push_insn_state(p);
2139 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2140 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2141 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2142
2143 brw_push_insn_state(p);
2144 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2145 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2146
2147 /* set message header global offset field (reg 0, element 2) */
2148 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2149 brw_MOV(p,
2150 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2151 mrf.nr,
2152 2), BRW_REGISTER_TYPE_UD),
2153 brw_imm_ud(offset));
2154 brw_pop_insn_state(p);
2155
2156 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2157
2158 brw_inst_set_sfid(devinfo, insn, target_cache);
2159
2160 /* cast dest to a uword[8] vector */
2161 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2162
2163 brw_set_dest(p, insn, dest);
2164 if (devinfo->gen >= 6) {
2165 brw_set_src0(p, insn, mrf);
2166 } else {
2167 brw_set_src0(p, insn, brw_null_reg());
2168 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2169 }
2170
2171 brw_set_desc(p, insn,
2172 brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
2173 brw_dp_read_desc(devinfo, bind_table_index,
2174 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2175 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2176 BRW_DATAPORT_READ_TARGET_DATA_CACHE));
2177
2178 brw_pop_insn_state(p);
2179 }
2180
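/* Illustrative fetch (binding table slot 3 is an assumption): read one
 * float[4] constant from the start of the bound buffer:
 *
 *    brw_oword_block_read(p, dest, brw_message_reg(1), 0, 3);
 */
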
2181 brw_inst *
2182 brw_fb_WRITE(struct brw_codegen *p,
2183 struct brw_reg payload,
2184 struct brw_reg implied_header,
2185 unsigned msg_control,
2186 unsigned binding_table_index,
2187 unsigned msg_length,
2188 unsigned response_length,
2189 bool eot,
2190 bool last_render_target,
2191 bool header_present)
2192 {
2193 const struct gen_device_info *devinfo = p->devinfo;
2194 const unsigned target_cache =
2195 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2196 BRW_SFID_DATAPORT_WRITE);
2197 brw_inst *insn;
2198 unsigned msg_type;
2199 struct brw_reg dest, src0;
2200
2201 if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
2202 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2203 else
2204 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2205
2206 if (devinfo->gen >= 6) {
2207 insn = next_insn(p, BRW_OPCODE_SENDC);
2208 } else {
2209 insn = next_insn(p, BRW_OPCODE_SEND);
2210 }
2211 brw_inst_set_sfid(devinfo, insn, target_cache);
2212 brw_inst_set_compression(devinfo, insn, false);
2213
2214 if (devinfo->gen >= 6) {
2215 /* headerless version, just submit color payload */
2216 src0 = payload;
2217
2218 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2219 } else {
2220 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2221 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2222 src0 = implied_header;
2223
2224 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2225 }
2226
2227 brw_set_dest(p, insn, dest);
2228 brw_set_src0(p, insn, src0);
2229 brw_set_desc(p, insn,
2230 brw_message_desc(devinfo, msg_length, response_length,
2231 header_present) |
2232 brw_dp_write_desc(devinfo, binding_table_index, msg_control,
2233 msg_type, last_render_target,
2234 0 /* send_commit_msg */));
2235 brw_inst_set_eot(devinfo, insn, eot);
2236
2237 return insn;
2238 }
2239
2240 brw_inst *
2241 gen9_fb_READ(struct brw_codegen *p,
2242 struct brw_reg dst,
2243 struct brw_reg payload,
2244 unsigned binding_table_index,
2245 unsigned msg_length,
2246 unsigned response_length,
2247 bool per_sample)
2248 {
2249 const struct gen_device_info *devinfo = p->devinfo;
2250 assert(devinfo->gen >= 9);
2251 const unsigned msg_subtype =
2252 brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
2253 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2254
2255 brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
2256 brw_set_dest(p, insn, dst);
2257 brw_set_src0(p, insn, payload);
2258 brw_set_desc(
2259 p, insn,
2260 brw_message_desc(devinfo, msg_length, response_length, true) |
2261 brw_dp_read_desc(devinfo, binding_table_index,
2262 per_sample << 5 | msg_subtype,
2263 GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2264 BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2265 brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
2266
2267 return insn;
2268 }
2269
2270 /**
2271 * Texture sample instruction.
2272 * Note: the msg_type plus msg_length values determine exactly what kind
2273 * of sampling operation is performed. See volume 4, page 161 of docs.
2274 */
2275 void brw_SAMPLE(struct brw_codegen *p,
2276 struct brw_reg dest,
2277 unsigned msg_reg_nr,
2278 struct brw_reg src0,
2279 unsigned binding_table_index,
2280 unsigned sampler,
2281 unsigned msg_type,
2282 unsigned response_length,
2283 unsigned msg_length,
2284 unsigned header_present,
2285 unsigned simd_mode,
2286 unsigned return_format)
2287 {
2288 const struct gen_device_info *devinfo = p->devinfo;
2289 brw_inst *insn;
2290
2291 if (msg_reg_nr != -1)
2292 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2293
2294 insn = next_insn(p, BRW_OPCODE_SEND);
2295 brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
2296 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2297
2298 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2299 *
2300 * "Instruction compression is not allowed for this instruction (that
2301 * is, send). The hardware behavior is undefined if this instruction is
2302 * set as compressed. However, compress control can be set to "SecHalf"
2303 * to affect the EMask generation."
2304 *
2305 * No similar wording is found in later PRMs, but there are examples
2306 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2307 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2308 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2309 */
2310 brw_inst_set_compression(devinfo, insn, false);
2311
2312 if (devinfo->gen < 6)
2313 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2314
2315 brw_set_dest(p, insn, dest);
2316 brw_set_src0(p, insn, src0);
2317 brw_set_desc(p, insn,
2318 brw_message_desc(devinfo, msg_length, response_length,
2319 header_present) |
2320 brw_sampler_desc(devinfo, binding_table_index, sampler,
2321 msg_type, simd_mode, return_format));
2322 }
2323
2324 /* Adjust the message header's sampler state pointer to
2325 * select the correct group of 16 samplers.
2326 */
2327 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2328 struct brw_reg header,
2329 struct brw_reg sampler_index)
2330 {
2331 /* The "Sampler Index" field can only store values between 0 and 15.
2332 * However, we can add an offset to the "Sampler State Pointer"
2333 * field, effectively selecting a different set of 16 samplers.
2334 *
2335 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2336 * offset, and each sampler state is only 16-bytes, so we can't
2337 * exclusively use the offset - we have to use both.
2338 */
2339
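   /* Worked example (illustrative): for an immediate sampler index of 18,
    * the code below adds 16 * (18 / 16) * 16 = 256 bytes to the sampler
    * state pointer, selecting the second group of 16 samplers; the 4-bit
    * "Sampler Index" field is then assumed to carry 18 % 16 = 2 within
    * that group.
    */
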
2340 const struct gen_device_info *devinfo = p->devinfo;
2341
2342 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2343 const int sampler_state_size = 16; /* 16 bytes */
2344 uint32_t sampler = sampler_index.ud;
2345
2346 if (sampler >= 16) {
2347 assert(devinfo->is_haswell || devinfo->gen >= 8);
2348 brw_ADD(p,
2349 get_element_ud(header, 3),
2350 get_element_ud(brw_vec8_grf(0, 0), 3),
2351 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2352 }
2353 } else {
2354 /* Non-const sampler array indexing case */
2355 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2356 return;
2357 }
2358
2359 struct brw_reg temp = get_element_ud(header, 3);
2360
2361 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2362 brw_SHL(p, temp, temp, brw_imm_ud(4));
2363 brw_ADD(p,
2364 get_element_ud(header, 3),
2365 get_element_ud(brw_vec8_grf(0, 0), 3),
2366 temp);
2367 }
2368 }
2369
2370 /* All these variables are pretty confusing - we might be better off
2371 * using bitmasks and macros for this, in the old style. Or perhaps
2372 * just having the caller instantiate the fields in dword3 itself.
2373 */
2374 void brw_urb_WRITE(struct brw_codegen *p,
2375 struct brw_reg dest,
2376 unsigned msg_reg_nr,
2377 struct brw_reg src0,
2378 enum brw_urb_write_flags flags,
2379 unsigned msg_length,
2380 unsigned response_length,
2381 unsigned offset,
2382 unsigned swizzle)
2383 {
2384 const struct gen_device_info *devinfo = p->devinfo;
2385 brw_inst *insn;
2386
2387 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2388
2389 if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2390 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2391 brw_push_insn_state(p);
2392 brw_set_default_access_mode(p, BRW_ALIGN_1);
2393 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2394 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2395 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2396 BRW_REGISTER_TYPE_UD),
2397 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2398 brw_imm_ud(0xff00));
2399 brw_pop_insn_state(p);
2400 }
2401
2402 insn = next_insn(p, BRW_OPCODE_SEND);
2403
2404 assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2405
2406 brw_set_dest(p, insn, dest);
2407 brw_set_src0(p, insn, src0);
2408 brw_set_src1(p, insn, brw_imm_d(0));
2409
2410 if (devinfo->gen < 6)
2411 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2412
2413 brw_set_urb_message(p,
2414 insn,
2415 flags,
2416 msg_length,
2417 response_length,
2418 offset,
2419 swizzle);
2420 }
2421
2422 void
2423 brw_send_indirect_message(struct brw_codegen *p,
2424 unsigned sfid,
2425 struct brw_reg dst,
2426 struct brw_reg payload,
2427 struct brw_reg desc,
2428 unsigned desc_imm)
2429 {
2430 const struct gen_device_info *devinfo = p->devinfo;
2431 struct brw_inst *send;
2432
2433 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2434
2435 assert(desc.type == BRW_REGISTER_TYPE_UD);
2436
2437 if (desc.file == BRW_IMMEDIATE_VALUE) {
2438 send = next_insn(p, BRW_OPCODE_SEND);
2439 brw_set_desc(p, send, desc.ud | desc_imm);
2440
2441 } else {
2442 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2443
2444 brw_push_insn_state(p);
2445 brw_set_default_access_mode(p, BRW_ALIGN_1);
2446 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2447 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2448 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2449
2450 /* Load the indirect descriptor to an address register using OR so the
2451 * caller can specify additional descriptor bits with the desc_imm
2452 * immediate.
2453 */
2454 brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2455
2456 brw_pop_insn_state(p);
2457
2458 send = next_insn(p, BRW_OPCODE_SEND);
2459 brw_set_src1(p, send, addr);
2460 }
2461
2462 if (dst.width < BRW_EXECUTE_8)
2463 brw_inst_set_exec_size(devinfo, send, dst.width);
2464
2465 brw_set_dest(p, send, dst);
2466 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2467 brw_inst_set_sfid(devinfo, send, sfid);
2468 }
2469
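/* Illustrative call (desc_reg, mlen and rlen are assumptions): a sampler
 * SEND whose descriptor lives in a GRF, with the common immediate bits
 * OR'ed in via desc_imm:
 *
 *    brw_send_indirect_message(p, BRW_SFID_SAMPLER, dst, payload,
 *                              retype(desc_reg, BRW_REGISTER_TYPE_UD),
 *                              brw_message_desc(devinfo, mlen, rlen, true));
 */
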
2470 static void
2471 brw_send_indirect_surface_message(struct brw_codegen *p,
2472 unsigned sfid,
2473 struct brw_reg dst,
2474 struct brw_reg payload,
2475 struct brw_reg surface,
2476 unsigned desc_imm)
2477 {
2478 if (surface.file != BRW_IMMEDIATE_VALUE) {
2479 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2480
2481 brw_push_insn_state(p);
2482 brw_set_default_access_mode(p, BRW_ALIGN_1);
2483 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2484 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2485 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2486
2487 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2488 * some surface array is accessed out of bounds.
2489 */
2490 brw_AND(p, addr,
2491 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2492 BRW_GET_SWZ(surface.swizzle, 0)),
2493 brw_imm_ud(0xff));
2494
2495 brw_pop_insn_state(p);
2496
2497 surface = addr;
2498 }
2499
2500 brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm);
2501 }
2502
2503 static bool
2504 while_jumps_before_offset(const struct gen_device_info *devinfo,
2505 brw_inst *insn, int while_offset, int start_offset)
2506 {
2507 int scale = 16 / brw_jump_scale(devinfo);
2508 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2509 : brw_inst_jip(devinfo, insn);
2510 assert(jip < 0);
2511 return while_offset + jip * scale <= start_offset;
2512 }
2513
2514
2515 static int
2516 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2517 {
2518 int offset;
2519 void *store = p->store;
2520 const struct gen_device_info *devinfo = p->devinfo;
2521
2522 int depth = 0;
2523
2524 for (offset = next_offset(devinfo, store, start_offset);
2525 offset < p->next_insn_offset;
2526 offset = next_offset(devinfo, store, offset)) {
2527 brw_inst *insn = store + offset;
2528
2529 switch (brw_inst_opcode(devinfo, insn)) {
2530 case BRW_OPCODE_IF:
2531 depth++;
2532 break;
2533 case BRW_OPCODE_ENDIF:
2534 if (depth == 0)
2535 return offset;
2536 depth--;
2537 break;
2538 case BRW_OPCODE_WHILE:
2539 /* If the while doesn't jump before our instruction, it's the end
2540 * of a sibling do...while loop. Ignore it.
2541 */
2542 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2543 continue;
2544 /* fallthrough */
2545 case BRW_OPCODE_ELSE:
2546 case BRW_OPCODE_HALT:
2547 if (depth == 0)
2548 return offset;
2549 }
2550 }
2551
2552 return 0;
2553 }
2554
2555 /* There is no DO instruction on gen6, so to find the end of the loop
2556 * we have to see if the loop is jumping back before our start
2557 * instruction.
2558 */
2559 static int
2560 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2561 {
2562 const struct gen_device_info *devinfo = p->devinfo;
2563 int offset;
2564 void *store = p->store;
2565
2566 assert(devinfo->gen >= 6);
2567
2568 /* Always start after the instruction (such as a WHILE) we're trying to fix
2569 * up.
2570 */
2571 for (offset = next_offset(devinfo, store, start_offset);
2572 offset < p->next_insn_offset;
2573 offset = next_offset(devinfo, store, offset)) {
2574 brw_inst *insn = store + offset;
2575
2576 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2577 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2578 return offset;
2579 }
2580 }
2581 assert(!"not reached");
2582 return start_offset;
2583 }
2584
2585 /* After program generation, go back and update the UIP and JIP of
2586 * BREAK, CONT, and HALT instructions to their correct locations.
2587 */
2588 void
2589 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2590 {
2591 const struct gen_device_info *devinfo = p->devinfo;
2592 int offset;
2593 int br = brw_jump_scale(devinfo);
2594 int scale = 16 / br;
2595 void *store = p->store;
2596
2597 if (devinfo->gen < 6)
2598 return;
2599
2600 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2601 brw_inst *insn = store + offset;
2602 assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2603
2604 int block_end_offset = brw_find_next_block_end(p, offset);
2605 switch (brw_inst_opcode(devinfo, insn)) {
2606 case BRW_OPCODE_BREAK:
2607 assert(block_end_offset != 0);
2608 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2609 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2610 brw_inst_set_uip(devinfo, insn,
2611 (brw_find_loop_end(p, offset) - offset +
2612 (devinfo->gen == 6 ? 16 : 0)) / scale);
2613 break;
2614 case BRW_OPCODE_CONTINUE:
2615 assert(block_end_offset != 0);
2616 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2617 brw_inst_set_uip(devinfo, insn,
2618 (brw_find_loop_end(p, offset) - offset) / scale);
2619
2620 assert(brw_inst_uip(devinfo, insn) != 0);
2621 assert(brw_inst_jip(devinfo, insn) != 0);
2622 break;
2623
2624 case BRW_OPCODE_ENDIF: {
2625 int32_t jump = (block_end_offset == 0) ?
2626 1 * br : (block_end_offset - offset) / scale;
2627 if (devinfo->gen >= 7)
2628 brw_inst_set_jip(devinfo, insn, jump);
2629 else
2630 brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2631 break;
2632 }
2633
2634 case BRW_OPCODE_HALT:
2635 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2636 *
2637 * "In case of the halt instruction not inside any conditional
2638 * code block, the value of <JIP> and <UIP> should be the
2639 * same. In case of the halt instruction inside conditional code
2640 * block, the <UIP> should be the end of the program, and the
2641 * <JIP> should be end of the most inner conditional code block."
2642 *
2643 * The uip will have already been set by whoever set up the
2644 * instruction.
2645 */
2646 if (block_end_offset == 0) {
2647 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2648 } else {
2649 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2650 }
2651 assert(brw_inst_uip(devinfo, insn) != 0);
2652 assert(brw_inst_jip(devinfo, insn) != 0);
2653 break;
2654 }
2655 }
2656 }
2657
2658 void brw_ff_sync(struct brw_codegen *p,
2659 struct brw_reg dest,
2660 unsigned msg_reg_nr,
2661 struct brw_reg src0,
2662 bool allocate,
2663 unsigned response_length,
2664 bool eot)
2665 {
2666 const struct gen_device_info *devinfo = p->devinfo;
2667 brw_inst *insn;
2668
2669 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2670
2671 insn = next_insn(p, BRW_OPCODE_SEND);
2672 brw_set_dest(p, insn, dest);
2673 brw_set_src0(p, insn, src0);
2674 brw_set_src1(p, insn, brw_imm_d(0));
2675
2676 if (devinfo->gen < 6)
2677 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2678
2679 brw_set_ff_sync_message(p,
2680 insn,
2681 allocate,
2682 response_length,
2683 eot);
2684 }
2685
2686 /**
2687 * Emit the SEND instruction necessary to generate stream output data on Gen6
2688 * (for transform feedback).
2689 *
2690 * If send_commit_msg is true, this is the last piece of stream output data
2691 * from this thread, so send the data as a committed write. According to the
2692 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2693 *
2694 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2695 * writes are complete by sending the final write as a committed write."
2696 */
2697 void
2698 brw_svb_write(struct brw_codegen *p,
2699 struct brw_reg dest,
2700 unsigned msg_reg_nr,
2701 struct brw_reg src0,
2702 unsigned binding_table_index,
2703 bool send_commit_msg)
2704 {
2705 const struct gen_device_info *devinfo = p->devinfo;
2706 const unsigned target_cache =
2707 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2708 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2709 BRW_SFID_DATAPORT_WRITE);
2710 brw_inst *insn;
2711
2712 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2713
2714 insn = next_insn(p, BRW_OPCODE_SEND);
2715 brw_inst_set_sfid(devinfo, insn, target_cache);
2716 brw_set_dest(p, insn, dest);
2717 brw_set_src0(p, insn, src0);
2718 brw_set_desc(p, insn,
2719 brw_message_desc(devinfo, 1, send_commit_msg, true) |
2720 brw_dp_write_desc(devinfo, binding_table_index,
2721 0, /* msg_control: ignored */
2722 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2723 0, /* last_render_target: ignored */
2724 send_commit_msg)); /* send_commit_msg */
2725 }
2726
2727 static unsigned
2728 brw_surface_payload_size(struct brw_codegen *p,
2729 unsigned num_channels,
2730 bool has_simd4x2,
2731 bool has_simd16)
2732 {
2733 if (has_simd4x2 && brw_get_default_access_mode(p) == BRW_ALIGN_16)
2734 return 1;
2735 else if (has_simd16 && brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2736 return 2 * num_channels;
2737 else
2738 return num_channels;
2739 }
2740
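/* For example (illustrative): a SIMD16 Align1 untyped read of four channels
 * yields a response length of 2 * 4 = 8 registers, whereas an Align16
 * SIMD4x2 message on hardware that supports it needs only one.
 */
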
2741 static uint32_t
2742 brw_dp_untyped_atomic_desc(struct brw_codegen *p,
2743 unsigned atomic_op,
2744 bool response_expected)
2745 {
2746 const struct gen_device_info *devinfo = p->devinfo;
2747 unsigned msg_control =
2748 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2749 (response_expected ? 1 << 5 : 0); /* Return data expected */
2750 unsigned msg_type;
2751
2752 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2753 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2754 if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
2755 msg_control |= 1 << 4; /* SIMD8 mode */
2756
2757 msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
2758 } else {
2759 msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
2760 }
2761 } else {
2762 if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
2763 msg_control |= 1 << 4; /* SIMD8 mode */
2764
2765 msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
2766 }
2767
2768 return brw_dp_surface_desc(devinfo, msg_type, msg_control);
2769 }
2770
2771 void
2772 brw_untyped_atomic(struct brw_codegen *p,
2773 struct brw_reg dst,
2774 struct brw_reg payload,
2775 struct brw_reg surface,
2776 unsigned atomic_op,
2777 unsigned msg_length,
2778 bool response_expected,
2779 bool header_present)
2780 {
2781 const struct gen_device_info *devinfo = p->devinfo;
2782 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2783 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2784 GEN7_SFID_DATAPORT_DATA_CACHE);
2785 const unsigned response_length = brw_surface_payload_size(
2786 p, response_expected, devinfo->gen >= 8 || devinfo->is_haswell, true);
2787 const unsigned desc =
2788 brw_message_desc(devinfo, msg_length, response_length, header_present) |
2789 brw_dp_untyped_atomic_desc(p, atomic_op, response_expected);
2790 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
2791 /* Mask out unused components -- This is especially important in Align16
2792 * mode on generations that don't have native support for SIMD4x2 atomics,
2793 * because unused but enabled components will cause the dataport to perform
2794 * additional atomic operations on the addresses that happen to be in the
2795 * uninitialized Y, Z and W coordinates of the payload.
2796 */
2797 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2798
2799 brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
2800 payload, surface, desc);
2801 }
2802
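/* Illustrative use (payload layout and message length are assumptions):
 * a SIMD8 headerless untyped atomic ADD that returns the old value:
 *
 *    brw_untyped_atomic(p, dst, payload, surface, BRW_AOP_ADD,
 *                       2, true, false);
 *
 * where 2 is the message length in registers, "true" requests the old
 * value back and "false" omits the header.
 */
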
2803 static uint32_t
2804 brw_dp_untyped_atomic_float_desc(struct brw_codegen *p,
2805 unsigned atomic_op,
2806 bool response_expected)
2807 {
2808 const struct gen_device_info *devinfo = p->devinfo;
2809 const unsigned msg_type = GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP;
2810 unsigned msg_control =
2811 atomic_op | /* Atomic Operation Type: BRW_AOP_F* */
2812 (response_expected ? 1 << 5 : 0); /* Return data expected */
2813
2814 assert(devinfo->gen >= 9);
2815 assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
2816
2817 if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
2818 msg_control |= 1 << 4; /* SIMD8 mode */
2819
2820 return brw_dp_surface_desc(devinfo, msg_type, msg_control);
2821 }
2822
2823 void
2824 brw_untyped_atomic_float(struct brw_codegen *p,
2825 struct brw_reg dst,
2826 struct brw_reg payload,
2827 struct brw_reg surface,
2828 unsigned atomic_op,
2829 unsigned msg_length,
2830 bool response_expected,
2831 bool header_present)
2832 {
2833 const struct gen_device_info *devinfo = p->devinfo;
2834
2835 assert(devinfo->gen >= 9);
2836 assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
2837
2838 const unsigned sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2839 const unsigned response_length = brw_surface_payload_size(
2840 p, response_expected, true, true);
2841 const unsigned desc =
2842 brw_message_desc(devinfo, msg_length, response_length, header_present) |
2843 brw_dp_untyped_atomic_float_desc(p, atomic_op, response_expected);
2844
2845 brw_send_indirect_surface_message(p, sfid,
2846 brw_writemask(dst, WRITEMASK_XYZW),
2847 payload, surface, desc);
2848 }
2849
2850 static uint32_t
2851 brw_dp_untyped_surface_read_desc(struct brw_codegen *p,
2852 unsigned num_channels)
2853 {
2854 const struct gen_device_info *devinfo = p->devinfo;
2855 const unsigned msg_type = (devinfo->gen >= 8 || devinfo->is_haswell ?
2856 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2857 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ);
2858 /* Set mask of 32-bit channels to drop. */
2859 unsigned msg_control = 0xf & (0xf << num_channels);
2860
2861 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2862 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2863 msg_control |= 1 << 4; /* SIMD16 mode */
2864 else
2865 msg_control |= 2 << 4; /* SIMD8 mode */
2866 }
2867
2868 return brw_dp_surface_desc(devinfo, msg_type, msg_control);
2869 }
2870
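/* For example (illustrative): num_channels == 2 gives a drop mask of
 * 0xf & (0xf << 2) = 0xc, so Z and W are dropped and only X and Y are
 * returned.
 */
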
2871 void
2872 brw_untyped_surface_read(struct brw_codegen *p,
2873 struct brw_reg dst,
2874 struct brw_reg payload,
2875 struct brw_reg surface,
2876 unsigned msg_length,
2877 unsigned num_channels)
2878 {
2879 const struct gen_device_info *devinfo = p->devinfo;
2880 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2881 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2882 GEN7_SFID_DATAPORT_DATA_CACHE);
2883 const unsigned response_length =
2884 brw_surface_payload_size(p, num_channels, true, true);
2885 const unsigned desc =
2886 brw_message_desc(devinfo, msg_length, response_length, false) |
2887 brw_dp_untyped_surface_read_desc(p, num_channels);
2888
2889 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
2890 }
2891
2892 static uint32_t
2893 brw_dp_untyped_surface_write_desc(struct brw_codegen *p,
2894 unsigned num_channels)
2895 {
2896 const struct gen_device_info *devinfo = p->devinfo;
2897 const unsigned msg_type = (devinfo->gen >= 8 || devinfo->is_haswell ?
2898 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
2899 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
2900 /* Set mask of 32-bit channels to drop. */
2901 unsigned msg_control = 0xf & (0xf << num_channels);
2902
2903 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2904 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2905 msg_control |= 1 << 4; /* SIMD16 mode */
2906 else
2907 msg_control |= 2 << 4; /* SIMD8 mode */
2908 } else {
2909 if (devinfo->gen >= 8 || devinfo->is_haswell)
2910 msg_control |= 0 << 4; /* SIMD4x2 mode */
2911 else
2912 msg_control |= 2 << 4; /* SIMD8 mode */
2913 }
2914
2915 return brw_dp_surface_desc(devinfo, msg_type, msg_control);
2916 }
2917
2918 void
2919 brw_untyped_surface_write(struct brw_codegen *p,
2920 struct brw_reg payload,
2921 struct brw_reg surface,
2922 unsigned msg_length,
2923 unsigned num_channels,
2924 bool header_present)
2925 {
2926 const struct gen_device_info *devinfo = p->devinfo;
2927 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2928 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2929 GEN7_SFID_DATAPORT_DATA_CACHE);
2930 const unsigned desc =
2931 brw_message_desc(devinfo, msg_length, 0, header_present) |
2932 brw_dp_untyped_surface_write_desc(p, num_channels);
2933 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
2934 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
2935 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
2936 WRITEMASK_X : WRITEMASK_XYZW;
2937
2938 brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
2939 payload, surface, desc);
2940 }
2941
2942 static unsigned
2943 brw_byte_scattered_data_element_from_bit_size(unsigned bit_size)
2944 {
2945 switch (bit_size) {
2946 case 8:
2947 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_BYTE;
2948 case 16:
2949 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_WORD;
2950 case 32:
2951 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_DWORD;
2952 default:
2953 unreachable("Unsupported bit_size for byte scattered messages");
2954 }
2955 }
2956
2957 static uint32_t
2958 brw_dp_byte_scattered_desc(struct brw_codegen *p, unsigned bit_size,
2959 unsigned msg_type)
2960 {
2961 const struct gen_device_info *devinfo = p->devinfo;
2962 unsigned msg_control =
2963 brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
2964
2965 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2966 msg_control |= 1; /* SIMD16 mode */
2967 else
2968 msg_control |= 0; /* SIMD8 mode */
2969
2970 return brw_dp_surface_desc(devinfo, msg_type, msg_control);
2971 }
2972
2973 void
2974 brw_byte_scattered_read(struct brw_codegen *p,
2975 struct brw_reg dst,
2976 struct brw_reg payload,
2977 struct brw_reg surface,
2978 unsigned msg_length,
2979 unsigned bit_size)
2980 {
2981 const struct gen_device_info *devinfo = p->devinfo;
2982 assert(devinfo->gen > 7 || devinfo->is_haswell);
2983 assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
2984 const unsigned response_length =
2985 brw_surface_payload_size(p, 1, true, true);
2986 const unsigned desc =
2987 brw_message_desc(devinfo, msg_length, response_length, false) |
2988 brw_dp_byte_scattered_desc(p, bit_size,
2989 HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ);
2990
2991 brw_send_indirect_surface_message(p, GEN7_SFID_DATAPORT_DATA_CACHE,
2992 dst, payload, surface, desc);
2993 }
2994
2995 void
2996 brw_byte_scattered_write(struct brw_codegen *p,
2997 struct brw_reg payload,
2998 struct brw_reg surface,
2999 unsigned msg_length,
3000 unsigned bit_size,
3001 bool header_present)
3002 {
3003 const struct gen_device_info *devinfo = p->devinfo;
3004 assert(devinfo->gen > 7 || devinfo->is_haswell);
3005 assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
3006 const unsigned desc =
3007 brw_message_desc(devinfo, msg_length, 0, header_present) |
3008 brw_dp_byte_scattered_desc(p, bit_size,
3009 HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE);
3010
3011 brw_send_indirect_surface_message(p, GEN7_SFID_DATAPORT_DATA_CACHE,
3012 brw_writemask(brw_null_reg(),
3013 WRITEMASK_XYZW),
3014 payload, surface, desc);
3015 }
3016
3017 static uint32_t
3018 brw_dp_typed_atomic_desc(struct brw_codegen *p,
3019 unsigned atomic_op,
3020 bool response_expected)
3021 {
3022 const struct gen_device_info *devinfo = p->devinfo;
3023 unsigned msg_control =
3024 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3025 (response_expected ? 1 << 5 : 0); /* Return data expected */
3026 unsigned msg_type;
3027
3028 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3029 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3030 if ((brw_get_default_group(p) / 8) % 2 == 1)
3031 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3032
3033 msg_type = HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP;
3034 } else {
3035 msg_type = HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2;
3036 }
3037
3038 } else {
3039 if ((brw_get_default_group(p) / 8) % 2 == 1)
3040 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3041
3042 msg_type = GEN7_DATAPORT_RC_TYPED_ATOMIC_OP;
3043 }
3044
3045 return brw_dp_surface_desc(devinfo, msg_type, msg_control);
3046 }
3047
3048 void
3049 brw_typed_atomic(struct brw_codegen *p,
3050 struct brw_reg dst,
3051 struct brw_reg payload,
3052 struct brw_reg surface,
3053 unsigned atomic_op,
3054 unsigned msg_length,
3055 bool response_expected,
3056 bool header_present) {
3057 const struct gen_device_info *devinfo = p->devinfo;
3058 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3059 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3060 GEN6_SFID_DATAPORT_RENDER_CACHE);
3061 const unsigned response_length = brw_surface_payload_size(
3062 p, response_expected, devinfo->gen >= 8 || devinfo->is_haswell, false);
3063 const unsigned desc =
3064 brw_message_desc(devinfo, msg_length, response_length, header_present) |
3065 brw_dp_typed_atomic_desc(p, atomic_op, response_expected);
3066 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3067 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3068 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3069
3070 brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
3071 payload, surface, desc);
3072 }
3073
3074 static uint32_t
3075 brw_dp_typed_surface_read_desc(struct brw_codegen *p,
3076 unsigned num_channels)
3077 {
3078 const struct gen_device_info *devinfo = p->devinfo;
3079 /* Set mask of unused channels. */
3080 unsigned msg_control = 0xf & (0xf << num_channels);
3081 unsigned msg_type;
3082
3083 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3084 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3085 if ((brw_get_default_group(p) / 8) % 2 == 1)
3086 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3087 else
3088 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3089 }
3090
3091 msg_type = HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ;
3092 } else {
3093 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3094 if ((brw_get_default_group(p) / 8) % 2 == 1)
3095 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3096 }
3097
3098 msg_type = GEN7_DATAPORT_RC_TYPED_SURFACE_READ;
3099 }
3100
3101 return brw_dp_surface_desc(devinfo, msg_type, msg_control);
3102 }
3103
3104 void
3105 brw_typed_surface_read(struct brw_codegen *p,
3106 struct brw_reg dst,
3107 struct brw_reg payload,
3108 struct brw_reg surface,
3109 unsigned msg_length,
3110 unsigned num_channels,
3111 bool header_present)
3112 {
3113 const struct gen_device_info *devinfo = p->devinfo;
3114 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3115 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3116 GEN6_SFID_DATAPORT_RENDER_CACHE);
3117 const unsigned response_length = brw_surface_payload_size(
3118 p, num_channels, devinfo->gen >= 8 || devinfo->is_haswell, false);
3119 const unsigned desc =
3120 brw_message_desc(devinfo, msg_length, response_length, header_present) |
3121 brw_dp_typed_surface_read_desc(p, num_channels);
3122
3123 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3124 }
3125
3126 static uint32_t
3127 brw_dp_typed_surface_write_desc(struct brw_codegen *p,
3128 unsigned num_channels)
3129 {
3130 const struct gen_device_info *devinfo = p->devinfo;
3131 /* Set mask of unused channels. */
3132 unsigned msg_control = 0xf & (0xf << num_channels);
3133 unsigned msg_type;
3134
3135 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3136 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3137 if ((brw_get_default_group(p) / 8) % 2 == 1)
3138 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3139 else
3140 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3141 }
3142
3143 msg_type = HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE;
3144
3145 } else {
3146 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3147 if ((brw_get_default_group(p) / 8) % 2 == 1)
3148 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3149 }
3150
3151 msg_type = GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE;
3152 }
3153
3154 return brw_dp_surface_desc(devinfo, msg_type, msg_control);
3155 }
3156
3157 void
3158 brw_typed_surface_write(struct brw_codegen *p,
3159 struct brw_reg payload,
3160 struct brw_reg surface,
3161 unsigned msg_length,
3162 unsigned num_channels,
3163 bool header_present)
3164 {
3165 const struct gen_device_info *devinfo = p->devinfo;
3166 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3167 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3168 GEN6_SFID_DATAPORT_RENDER_CACHE);
3169 const unsigned desc =
3170 brw_message_desc(devinfo, msg_length, 0, header_present) |
3171 brw_dp_typed_surface_write_desc(p, num_channels);
3172 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3173 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3174 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3175 WRITEMASK_X : WRITEMASK_XYZW);
3176
3177 brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
3178 payload, surface, desc);
3179 }
3180
3181 static void
3182 brw_set_memory_fence_message(struct brw_codegen *p,
3183 struct brw_inst *insn,
3184 enum brw_message_target sfid,
3185 bool commit_enable)
3186 {
3187 const struct gen_device_info *devinfo = p->devinfo;
3188
3189 brw_set_desc(p, insn, brw_message_desc(
3190 devinfo, 1, (commit_enable ? 1 : 0), true));
3191
3192 brw_inst_set_sfid(devinfo, insn, sfid);
3193
3194 switch (sfid) {
3195 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3196 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3197 break;
3198 case GEN7_SFID_DATAPORT_DATA_CACHE:
3199 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3200 break;
3201 default:
3202 unreachable("Not reached");
3203 }
3204
3205 if (commit_enable)
3206 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3207 }
3208
3209 void
3210 brw_memory_fence(struct brw_codegen *p,
3211 struct brw_reg dst,
3212 enum opcode send_op)
3213 {
3214 const struct gen_device_info *devinfo = p->devinfo;
3215 const bool commit_enable =
3216 devinfo->gen >= 10 || /* HSD ES # 1404612949 */
3217 (devinfo->gen == 7 && !devinfo->is_haswell);
3218 struct brw_inst *insn;
3219
3220 brw_push_insn_state(p);
3221 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3222 brw_set_default_exec_size(p, BRW_EXECUTE_1);
3223 dst = vec1(dst);
3224
3225    /* Set dst as destination for dependency tracking; the MEMORY_FENCE
3226 * message doesn't write anything back.
3227 */
3228 insn = next_insn(p, send_op);
3229 dst = retype(dst, BRW_REGISTER_TYPE_UW);
3230 brw_set_dest(p, insn, dst);
3231 brw_set_src0(p, insn, dst);
3232 brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
3233 commit_enable);
3234
3235 if (devinfo->gen == 7 && !devinfo->is_haswell) {
3236 /* IVB does typed surface access through the render cache, so we need to
3237 * flush it too. Use a different register so both flushes can be
3238 * pipelined by the hardware.
3239 */
3240 insn = next_insn(p, send_op);
3241 brw_set_dest(p, insn, offset(dst, 1));
3242 brw_set_src0(p, insn, offset(dst, 1));
3243 brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
3244 commit_enable);
3245
3246 /* Now write the response of the second message into the response of the
3247 * first to trigger a pipeline stall -- This way future render and data
3248 * cache messages will be properly ordered with respect to past data and
3249 * render cache messages.
3250 */
3251 brw_MOV(p, dst, offset(dst, 1));
3252 }
3253
3254 brw_pop_insn_state(p);
3255 }
3256
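/* Illustrative call (the scratch register is an assumption): fence the data
 * cache using g126 for dependency tracking; on IVB the code above also
 * consumes g127 for the render cache flush:
 *
 *    brw_memory_fence(p, retype(brw_vec8_grf(126, 0), BRW_REGISTER_TYPE_UD),
 *                     BRW_OPCODE_SEND);
 */
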
3257 void
3258 brw_pixel_interpolator_query(struct brw_codegen *p,
3259 struct brw_reg dest,
3260 struct brw_reg mrf,
3261 bool noperspective,
3262 unsigned mode,
3263 struct brw_reg data,
3264 unsigned msg_length,
3265 unsigned response_length)
3266 {
3267 const struct gen_device_info *devinfo = p->devinfo;
3268 const uint16_t exec_size = brw_get_default_exec_size(p);
3269 const unsigned slot_group = brw_get_default_group(p) / 16;
3270 const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3271 const unsigned desc =
3272 brw_message_desc(devinfo, msg_length, response_length, false) |
3273 brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
3274 slot_group);
3275
3276 /* brw_send_indirect_message will automatically use a direct send message
3277 * if data is actually immediate.
3278 */
3279 brw_send_indirect_message(p,
3280 GEN7_SFID_PIXEL_INTERPOLATOR,
3281 dest,
3282 mrf,
3283 vec1(data),
3284 desc);
3285 }
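
/* Illustrative usage (a sketch, not original code): a shared-offset query
 * at a compile-time pixel offset might look like
 *
 *    brw_pixel_interpolator_query(p, dest, payload, false,
 *                                 GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET,
 *                                 brw_imm_ud(off), msg_len, rlen);
 *
 * where off is a hypothetical packed x/y offset word and msg_len/rlen are
 * the message and response lengths computed by the caller.
 */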

void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just
          * find the first bit set in the execution mask.  The register
          * exists on HSW already but it reads back as all ones when the
          * current instruction has execution masking disabled, so it's
          * kind of useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's
             * not tightly packed (i.e. it doesn't have the form '2^n - 1'
             * for some n).  Combine ce0 with the given dispatch (or
             * vector) mask to mask off those channels which were never
             * dispatched by the hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value
          * of ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
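         /* For example (illustrative numbers): with an effective execution
          * mask of 0xfffffff0, FBL returns 4 -- the index of the lowest
          * set bit and hence of the first live channel.
          */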
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_reg(p->current->flag_subreg / 2,
                                                  p->current->flag_subreg % 2);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking
          * and a conditional modifier enabled in order to get the full
          * execution mask into the flag register: each MOV of zero
          * evaluates the Z condition as true for every enabled channel, so
          * the flag bits end up holding the execution mask.  We could use
          * a single 32-wide move here if it weren't for the hardware bug
          * that causes channel enables to be applied incorrectly to the
          * second half of 32-wide instructions on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         }

         /* Find the first bit set in the exec_size-wide portion of the
          * flag register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.  Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
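         /* Explanatory note (hedged reading of the sequence below): the
          * unmasked MOV writes 1 to the first slot, and the masked MOV
          * then overwrites it with 0 only if that slot was actually
          * dispatched, so dst ends up holding 0 if the first SIMD4x2 slot
          * is live and 1 otherwise.
          */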
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}

void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job,
       * but asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset.  The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address.  Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));
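         /* Worked example (illustrative): for a 4-byte type with unit
          * horizontal stride (encoded hstride == 1), the shift amount is
          * log2(4) + 1 - 1 = 2, so addr = idx * 4, the byte offset of
          * component idx within the region.
          */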

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
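         /* E.g. (illustrative numbers) a source at byte offset 600 exceeds
          * the 512-byte immediate range: the ADD below moves 512 into the
          * address register and the remaining 88 bytes stay in the
          * immediate.
          */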
         if (offset >= limit) {
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation
             *    is integer DWord multiply, indirect addressing must not
             *    be used."
             *
             * To work around this restriction, we do two integer MOVs
             * instead of one 64-bit MOV.  Because no double value should
             * ever cross a register boundary, it's safe to use the
             * immediate offset in the indirect here to handle adding 4
             * bytes to the offset and avoid the extra ADD to the register
             * file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate
          * it to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
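
/* A minimal usage sketch (assumptions noted, not original code): to copy
 * channel idx of a SIMD8 float vector v into a uniform u in Align1 mode, a
 * caller might emit
 *
 *    brw_broadcast(p, brw_vec1_grf(u_nr, 0), brw_vec8_grf(v_nr, 0),
 *                  retype(brw_vec1_grf(idx_nr, 0), BRW_REGISTER_TYPE_UD));
 *
 * where u_nr, v_nr and idx_nr are hypothetical GRF numbers allocated by
 * the caller.
 */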

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword
 * for all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just
 * write one u32.  So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the
    * given offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
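   /* Descriptor note (a hedged reading of the helpers below): message
    * length 2 covers the two payload registers (the buffer offset and the
    * value to add); response length 0 because the pre-operation value
    * returned by the atomic is ignored.
    */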
   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
                          brw_dp_untyped_atomic_desc(p, BRW_AOP_ADD, false)));

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_binding_table_index(devinfo, send, surf_index);

   brw_pop_insn_state(p);
}


/**
 * Emit the SEND message for a barrier
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());
   brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
   brw_inst_set_gateway_notify(devinfo, inst, 1);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}


/**
 * Emit the wait instruction for a barrier
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}
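
/* A typical pairing (illustrative sketch, not original code): a
 * compute-shader barrier sends the gateway message and then stalls on the
 * notification register until the whole group has arrived, e.g.
 *
 *    brw_barrier(p, brw_vec8_grf(barrier_payload_nr, 0));
 *    brw_WAIT(p);
 *
 * where barrier_payload_nr is a hypothetical GRF holding the barrier ID
 * payload set up by the caller.
 */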

/**
 * Changes the floating-point rounding mode by updating the control
 * register field defined at the cr0.0[5-6] bits.  This function supports
 * switching among RTNE (00), RU (01), RD (10) and RTZ (11) rounding using
 * bitwise operations.  Only RTNE and RTZ rounding are currently emitted
 * from NIR.
 */
void
brw_rounding_mode(struct brw_codegen *p,
                  enum brw_rnd_mode mode)
{
   const unsigned bits = mode << BRW_CR0_RND_MODE_SHIFT;

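   /* Note on the bit trick below: when bits equals the full field mask
    * (RTZ, 11b) the AND that clears the field is skipped, since the OR
    * alone overwrites both bits; when bits is zero (RTNE, 00b) the OR is
    * skipped instead.  So RTNE and RTZ each need only a single cr0 write.
    */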
   if (bits != BRW_CR0_RND_MODE_MASK) {
      brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
                               brw_imm_ud(~BRW_CR0_RND_MODE_MASK));
      brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);

      /* From the Skylake PRM, Volume 7, page 760:
       *
       *    "Implementation Restriction on Register Access: When the control
       *    register is used as an explicit source and/or destination,
       *    hardware does not ensure execution pipeline coherency.  Software
       *    must set the thread control field to 'switch' for an instruction
       *    that uses control register as an explicit operand."
       */
      brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
   }

   if (bits) {
      brw_inst *inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
                              brw_imm_ud(bits));
      brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
      brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
   }
}
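
/* Usage sketch (illustrative, not original code): a conversion that must
 * round toward zero could be bracketed as
 *
 *    brw_rounding_mode(p, BRW_RND_MODE_RTZ);
 *    ... emit the conversion ...
 *    brw_rounding_mode(p, BRW_RND_MODE_RTNE);
 *
 * restoring the IEEE default (round-to-nearest-even) afterwards.
 */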