intel/eu: Copy fields manually in brw_next_insn
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 struct brw_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct gen_device_info *devinfo = p->devinfo;
51 if (devinfo->gen < 6)
52 return;
53
54 if (src->file == BRW_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58 brw_push_insn_state(p);
59 brw_set_default_exec_size(p, BRW_EXECUTE_8);
60 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
61 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
62 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
63 retype(*src, BRW_REGISTER_TYPE_UD));
64 brw_pop_insn_state(p);
65 }
66 *src = brw_message_reg(msg_reg_nr);
67 }
68
69 static void
70 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
71 {
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
76 *
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
79 */
80 const struct gen_device_info *devinfo = p->devinfo;
81 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
82 reg->file = BRW_GENERAL_REGISTER_FILE;
83 reg->nr += GEN7_MRF_HACK_START;
84 }
85 }
86
87 void
88 brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
89 {
90 const struct gen_device_info *devinfo = p->devinfo;
91
92 if (dest.file == BRW_MESSAGE_REGISTER_FILE)
93 assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
94 else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
95 assert(dest.nr < 128);
96
97 gen7_convert_mrf_to_grf(p, &dest);
98
99 brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
100 brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
101
102 if (dest.address_mode == BRW_ADDRESS_DIRECT) {
103 brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
104
105 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
106 brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
107 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
108 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
109 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
110 } else {
111 brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
112 brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
113 if (dest.file == BRW_GENERAL_REGISTER_FILE ||
114 dest.file == BRW_MESSAGE_REGISTER_FILE) {
115 assert(dest.writemask != 0);
116 }
117 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
118 * Although Dst.HorzStride is a don't care for Align16, HW needs
119 * this to be programmed as "01".
120 */
121 brw_inst_set_dst_hstride(devinfo, inst, 1);
122 }
123 } else {
124 brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);
125
126 /* These are different sizes in align1 vs align16:
127 */
128 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
129 brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
130 dest.indirect_offset);
131 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
132 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
133 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
134 } else {
135 brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
136 dest.indirect_offset);
137 /* even ignored in da16, still need to set as '01' */
138 brw_inst_set_dst_hstride(devinfo, inst, 1);
139 }
140 }
141
142 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
143 * or 16 (SIMD16), as that's normally correct. However, when dealing with
144 * small registers, it can be useful for us to automatically reduce it to
145 * match the register size.
146 */
147 if (p->automatic_exec_sizes) {
148 /*
149 * In platforms that support fp64 we can emit instructions with a width
150 * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
151 * these cases we need to make sure that these instructions have their
152 * exec sizes set properly when they are emitted and we can't rely on
153 * this code to fix it.
154 */
155 bool fix_exec_size;
156 if (devinfo->gen >= 6)
157 fix_exec_size = dest.width < BRW_EXECUTE_4;
158 else
159 fix_exec_size = dest.width < BRW_EXECUTE_8;
160
161 if (fix_exec_size)
162 brw_inst_set_exec_size(devinfo, inst, dest.width);
163 }
164 }
165
166 void
167 brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
168 {
169 const struct gen_device_info *devinfo = p->devinfo;
170
171 if (reg.file == BRW_MESSAGE_REGISTER_FILE)
172 assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
173 else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
174 assert(reg.nr < 128);
175
176 gen7_convert_mrf_to_grf(p, &reg);
177
178 if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
179 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
180 /* Any source modifiers or regions will be ignored, since this just
181 * identifies the MRF/GRF to start reading the message contents from.
182 * Check for some likely failures.
183 */
184 assert(!reg.negate);
185 assert(!reg.abs);
186 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
187 }
188
189 brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
190 brw_inst_set_src0_abs(devinfo, inst, reg.abs);
191 brw_inst_set_src0_negate(devinfo, inst, reg.negate);
192 brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
193
194 if (reg.file == BRW_IMMEDIATE_VALUE) {
195 if (reg.type == BRW_REGISTER_TYPE_DF ||
196 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
197 brw_inst_set_imm_df(devinfo, inst, reg.df);
198 else if (reg.type == BRW_REGISTER_TYPE_UQ ||
199 reg.type == BRW_REGISTER_TYPE_Q)
200 brw_inst_set_imm_uq(devinfo, inst, reg.u64);
201 else
202 brw_inst_set_imm_ud(devinfo, inst, reg.ud);
203
204 if (type_sz(reg.type) < 8) {
205 brw_inst_set_src1_reg_file(devinfo, inst,
206 BRW_ARCHITECTURE_REGISTER_FILE);
207 brw_inst_set_src1_reg_hw_type(devinfo, inst,
208 brw_inst_src0_reg_hw_type(devinfo, inst));
209 }
210 } else {
211 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
212 brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
213 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
214 brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
215 } else {
216 brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
217 }
218 } else {
219 brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
220
221 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
222 brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
223 } else {
224 brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
225 }
226 }
227
228 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
229 if (reg.width == BRW_WIDTH_1 &&
230 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
231 brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
232 brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
233 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
234 } else {
235 brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
236 brw_inst_set_src0_width(devinfo, inst, reg.width);
237 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
238 }
239 } else {
240 brw_inst_set_src0_da16_swiz_x(devinfo, inst,
241 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
242 brw_inst_set_src0_da16_swiz_y(devinfo, inst,
243 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
244 brw_inst_set_src0_da16_swiz_z(devinfo, inst,
245 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
246 brw_inst_set_src0_da16_swiz_w(devinfo, inst,
247 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
248
249 if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
250 /* This is an oddity of the fact we're using the same
251 * descriptions for registers in align_16 as align_1:
252 */
253 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
254 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
255 reg.type == BRW_REGISTER_TYPE_DF &&
256 reg.vstride == BRW_VERTICAL_STRIDE_2) {
257 /* From SNB PRM:
258 *
259 * "For Align16 access mode, only encodings of 0000 and 0011
260 * are allowed. Other codes are reserved."
261 *
262 * Presumably the DevSNB behavior applies to IVB as well.
263 */
264 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
265 } else {
266 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
267 }
268 }
269 }
270 }
271
272
273 void
274 brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
275 {
276 const struct gen_device_info *devinfo = p->devinfo;
277
278 if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
279 assert(reg.nr < 128);
280
281 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
282 *
283 * "Accumulator registers may be accessed explicitly as src0
284 * operands only."
285 */
286 assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
287 reg.nr != BRW_ARF_ACCUMULATOR);
288
289 gen7_convert_mrf_to_grf(p, &reg);
290 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
291
292 brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
293 brw_inst_set_src1_abs(devinfo, inst, reg.abs);
294 brw_inst_set_src1_negate(devinfo, inst, reg.negate);
295
296 /* Only src1 can be immediate in two-argument instructions.
297 */
298 assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
299
300 if (reg.file == BRW_IMMEDIATE_VALUE) {
301 /* two-argument instructions can only use 32-bit immediates */
302 assert(type_sz(reg.type) < 8);
303 brw_inst_set_imm_ud(devinfo, inst, reg.ud);
304 } else {
305 /* This is a hardware restriction, which may or may not be lifted
306 * in the future:
307 */
308 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
309 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
310
311 brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
312 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
313 brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
314 } else {
315 brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
316 }
317
318 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
319 if (reg.width == BRW_WIDTH_1 &&
320 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
321 brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
322 brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
323 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
324 } else {
325 brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
326 brw_inst_set_src1_width(devinfo, inst, reg.width);
327 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
328 }
329 } else {
330 brw_inst_set_src1_da16_swiz_x(devinfo, inst,
331 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
332 brw_inst_set_src1_da16_swiz_y(devinfo, inst,
333 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
334 brw_inst_set_src1_da16_swiz_z(devinfo, inst,
335 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
336 brw_inst_set_src1_da16_swiz_w(devinfo, inst,
337 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
338
339 if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
340 /* This is an oddity of the fact we're using the same
341 * descriptions for registers in align_16 as align_1:
342 */
343 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
344 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
345 reg.type == BRW_REGISTER_TYPE_DF &&
346 reg.vstride == BRW_VERTICAL_STRIDE_2) {
347 /* From SNB PRM:
348 *
349 * "For Align16 access mode, only encodings of 0000 and 0011
350 * are allowed. Other codes are reserved."
351 *
352 * Presumably the DevSNB behavior applies to IVB as well.
353 */
354 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
355 } else {
356 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
357 }
358 }
359 }
360 }
361
362 /**
363 * Set the Message Descriptor and Extended Message Descriptor fields
364 * for SEND messages.
365 *
366 * \note This zeroes out the Function Control bits, so it must be called
367 * \b before filling out any message-specific data. Callers can
368 * choose not to fill in irrelevant bits; they will be zero.
369 */
370 void
371 brw_set_message_descriptor(struct brw_codegen *p,
372 brw_inst *inst,
373 enum brw_message_target sfid,
374 unsigned msg_length,
375 unsigned response_length,
376 bool header_present,
377 bool end_of_thread)
378 {
379 const struct gen_device_info *devinfo = p->devinfo;
380
381 brw_set_src1(p, inst, brw_imm_d(0));
382
383 /* For indirect sends, `inst` will not be the SEND/SENDC instruction
384 * itself; instead, it will be a MOV/OR into the address register.
385 *
386 * In this case, we avoid setting the extended message descriptor bits,
387 * since they go on the later SEND/SENDC instead and if set here would
388 * instead clobber the conditionalmod bits.
389 */
390 unsigned opcode = brw_inst_opcode(devinfo, inst);
391 if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
392 brw_inst_set_sfid(devinfo, inst, sfid);
393 }
394
395 brw_inst_set_mlen(devinfo, inst, msg_length);
396 brw_inst_set_rlen(devinfo, inst, response_length);
397 brw_inst_set_eot(devinfo, inst, end_of_thread);
398
399 if (devinfo->gen >= 5) {
400 brw_inst_set_header_present(devinfo, inst, header_present);
401 }
402 }
403
404 static void brw_set_math_message( struct brw_codegen *p,
405 brw_inst *inst,
406 unsigned function,
407 unsigned integer_type,
408 bool low_precision,
409 unsigned dataType )
410 {
411 const struct gen_device_info *devinfo = p->devinfo;
412 unsigned msg_length;
413 unsigned response_length;
414
415 /* Infer message length from the function */
416 switch (function) {
417 case BRW_MATH_FUNCTION_POW:
418 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
419 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
420 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
421 msg_length = 2;
422 break;
423 default:
424 msg_length = 1;
425 break;
426 }
427
428 /* Infer response length from the function */
429 switch (function) {
430 case BRW_MATH_FUNCTION_SINCOS:
431 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
432 response_length = 2;
433 break;
434 default:
435 response_length = 1;
436 break;
437 }
438
439
440 brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
441 msg_length, response_length, false, false);
442 brw_inst_set_math_msg_function(devinfo, inst, function);
443 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
444 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
445 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
446 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
447 brw_inst_set_saturate(devinfo, inst, 0);
448 }
449
450
451 static void brw_set_ff_sync_message(struct brw_codegen *p,
452 brw_inst *insn,
453 bool allocate,
454 unsigned response_length,
455 bool end_of_thread)
456 {
457 const struct gen_device_info *devinfo = p->devinfo;
458
459 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
460 1, response_length, true, end_of_thread);
461 brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
462 brw_inst_set_urb_allocate(devinfo, insn, allocate);
463 /* The following fields are not used by FF_SYNC: */
464 brw_inst_set_urb_global_offset(devinfo, insn, 0);
465 brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
466 brw_inst_set_urb_used(devinfo, insn, 0);
467 brw_inst_set_urb_complete(devinfo, insn, 0);
468 }
469
470 static void brw_set_urb_message( struct brw_codegen *p,
471 brw_inst *insn,
472 enum brw_urb_write_flags flags,
473 unsigned msg_length,
474 unsigned response_length,
475 unsigned offset,
476 unsigned swizzle_control )
477 {
478 const struct gen_device_info *devinfo = p->devinfo;
479
480 assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
481 assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
482 assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
483
484 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
485 msg_length, response_length, true,
486 flags & BRW_URB_WRITE_EOT);
487
488 if (flags & BRW_URB_WRITE_OWORD) {
489 assert(msg_length == 2); /* header + one OWORD of data */
490 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
491 } else {
492 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
493 }
494
495 brw_inst_set_urb_global_offset(devinfo, insn, offset);
496 brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
497
498 if (devinfo->gen < 8) {
499 brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
500 }
501
502 if (devinfo->gen < 7) {
503 brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
504 brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
505 } else {
506 brw_inst_set_urb_per_slot_offset(devinfo, insn,
507 !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
508 }
509 }
510
511 void
512 brw_set_dp_write_message(struct brw_codegen *p,
513 brw_inst *insn,
514 unsigned binding_table_index,
515 unsigned msg_control,
516 unsigned msg_type,
517 unsigned target_cache,
518 unsigned msg_length,
519 bool header_present,
520 unsigned last_render_target,
521 unsigned response_length,
522 unsigned end_of_thread,
523 unsigned send_commit_msg)
524 {
525 const struct gen_device_info *devinfo = p->devinfo;
526 const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
527 BRW_SFID_DATAPORT_WRITE);
528
529 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
530 header_present, end_of_thread);
531
532 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
533 brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
534 brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
535 brw_inst_set_rt_last(devinfo, insn, last_render_target);
536 if (devinfo->gen < 7) {
537 brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
538 }
539
540 if (devinfo->gen >= 11)
541 brw_inst_set_null_rt(devinfo, insn, false);
542 }
543
544 void
545 brw_set_dp_read_message(struct brw_codegen *p,
546 brw_inst *insn,
547 unsigned binding_table_index,
548 unsigned msg_control,
549 unsigned msg_type,
550 unsigned target_cache,
551 unsigned msg_length,
552 bool header_present,
553 unsigned response_length)
554 {
555 const struct gen_device_info *devinfo = p->devinfo;
556 const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
557 BRW_SFID_DATAPORT_READ);
558
559 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
560 header_present, false);
561
562 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
563 brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
564 brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
565 if (devinfo->gen < 6)
566 brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
567 }
568
569 void
570 brw_set_sampler_message(struct brw_codegen *p,
571 brw_inst *inst,
572 unsigned binding_table_index,
573 unsigned sampler,
574 unsigned msg_type,
575 unsigned response_length,
576 unsigned msg_length,
577 unsigned header_present,
578 unsigned simd_mode,
579 unsigned return_format)
580 {
581 const struct gen_device_info *devinfo = p->devinfo;
582
583 brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
584 response_length, header_present, false);
585
586 brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
587 brw_inst_set_sampler(devinfo, inst, sampler);
588 brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
589 if (devinfo->gen >= 5) {
590 brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
591 } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
592 brw_inst_set_sampler_return_format(devinfo, inst, return_format);
593 }
594 }
595
596 static void
597 gen7_set_dp_scratch_message(struct brw_codegen *p,
598 brw_inst *inst,
599 bool write,
600 bool dword,
601 bool invalidate_after_read,
602 unsigned num_regs,
603 unsigned addr_offset,
604 unsigned mlen,
605 unsigned rlen,
606 bool header_present)
607 {
608 const struct gen_device_info *devinfo = p->devinfo;
609 assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
610 (devinfo->gen >= 8 && num_regs == 8));
611 const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
612 num_regs - 1);
613
614 brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
615 mlen, rlen, header_present, false);
616 brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
617 brw_inst_set_scratch_read_write(devinfo, inst, write);
618 brw_inst_set_scratch_type(devinfo, inst, dword);
619 brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
620 brw_inst_set_scratch_block_size(devinfo, inst, block_size);
621 brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
622 }
623
624 struct brw_insn_state {
625 /* One of BRW_EXECUTE_* */
626 unsigned exec_size:3;
627
628 /* Group in units of channels */
629 unsigned group:5;
630
631 /* Compression control on gen4-5 */
632 bool compressed:1;
633
634 /* One of BRW_MASK_* */
635 unsigned mask_control:1;
636
637 bool saturate:1;
638
639 /* One of BRW_ALIGN_* */
640 unsigned access_mode:1;
641
642 /* One of BRW_PREDICATE_* */
643 enum brw_predicate predicate:4;
644
645 bool pred_inv:1;
646
647 /* Flag subreg. Bottom bit is subreg, top bit is reg */
648 unsigned flag_subreg:2;
649
650 bool acc_wr_control:1;
651 };
652
653 static struct brw_insn_state
654 brw_inst_get_state(const struct gen_device_info *devinfo,
655 const brw_inst *insn)
656 {
657 struct brw_insn_state state = { };
658
659 state.exec_size = brw_inst_exec_size(devinfo, insn);
660 if (devinfo->gen >= 6) {
661 state.group = brw_inst_qtr_control(devinfo, insn) * 8;
662 if (devinfo->gen >= 7)
663 state.group += brw_inst_nib_control(devinfo, insn) * 4;
664 } else {
665 unsigned qtr_control = brw_inst_qtr_control(devinfo, insn);
666 if (qtr_control == BRW_COMPRESSION_COMPRESSED) {
667 state.group = 0;
668 state.compressed = true;
669 } else {
670 state.group = qtr_control * 8;
671 state.compressed = false;
672 }
673 }
674 state.access_mode = brw_inst_access_mode(devinfo, insn);
675 state.mask_control = brw_inst_mask_control(devinfo, insn);
676 state.saturate = brw_inst_saturate(devinfo, insn);
677 state.predicate = brw_inst_pred_control(devinfo, insn);
678 state.pred_inv = brw_inst_pred_inv(devinfo, insn);
679
680 state.flag_subreg = brw_inst_flag_subreg_nr(devinfo, insn);
681 if (devinfo->gen >= 7)
682 state.flag_subreg += brw_inst_flag_reg_nr(devinfo, insn) * 2;
683
684 if (devinfo->gen >= 6)
685 state.acc_wr_control = brw_inst_acc_wr_control(devinfo, insn);
686
687 return state;
688 }
689
690 static void
691 brw_inst_set_state(const struct gen_device_info *devinfo,
692 brw_inst *insn,
693 const struct brw_insn_state *state)
694 {
695 brw_inst_set_exec_size(devinfo, insn, state->exec_size);
696 brw_inst_set_group(devinfo, insn, state->group);
697 brw_inst_set_compression(devinfo, insn, state->compressed);
698 brw_inst_set_access_mode(devinfo, insn, state->access_mode);
699 brw_inst_set_mask_control(devinfo, insn, state->mask_control);
700 brw_inst_set_saturate(devinfo, insn, state->saturate);
701 brw_inst_set_pred_control(devinfo, insn, state->predicate);
702 brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);
703
704 brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
705 if (devinfo->gen >= 7)
706 brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
707
708 if (devinfo->gen >= 6)
709 brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
710 }
711
712 #define next_insn brw_next_insn
713 brw_inst *
714 brw_next_insn(struct brw_codegen *p, unsigned opcode)
715 {
716 const struct gen_device_info *devinfo = p->devinfo;
717 brw_inst *insn;
718
719 if (p->nr_insn + 1 > p->store_size) {
720 p->store_size <<= 1;
721 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
722 }
723
724 p->next_insn_offset += 16;
725 insn = &p->store[p->nr_insn++];
726
727 memset(insn, 0, sizeof(*insn));
728 brw_inst_set_opcode(devinfo, insn, opcode);
729
730 /* Apply the default instruction state */
731 struct brw_insn_state current = brw_inst_get_state(devinfo, p->current);
732 brw_inst_set_state(devinfo, insn, &current);
733
734 return insn;
735 }
736
737 static brw_inst *
738 brw_alu1(struct brw_codegen *p, unsigned opcode,
739 struct brw_reg dest, struct brw_reg src)
740 {
741 brw_inst *insn = next_insn(p, opcode);
742 brw_set_dest(p, insn, dest);
743 brw_set_src0(p, insn, src);
744 return insn;
745 }
746
747 static brw_inst *
748 brw_alu2(struct brw_codegen *p, unsigned opcode,
749 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
750 {
751 /* 64-bit immediates are only supported on 1-src instructions */
752 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
753 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
754
755 brw_inst *insn = next_insn(p, opcode);
756 brw_set_dest(p, insn, dest);
757 brw_set_src0(p, insn, src0);
758 brw_set_src1(p, insn, src1);
759 return insn;
760 }
761
762 static int
763 get_3src_subreg_nr(struct brw_reg reg)
764 {
765 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
766 * use 32-bit units (components 0..7). Since they only support F/D/UD
767 * types, this doesn't lose any flexibility, but uses fewer bits.
768 */
769 return reg.subnr / 4;
770 }
771
772 static enum gen10_align1_3src_vertical_stride
773 to_3src_align1_vstride(enum brw_vertical_stride vstride)
774 {
775 switch (vstride) {
776 case BRW_VERTICAL_STRIDE_0:
777 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
778 case BRW_VERTICAL_STRIDE_2:
779 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
780 case BRW_VERTICAL_STRIDE_4:
781 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
782 case BRW_VERTICAL_STRIDE_8:
783 case BRW_VERTICAL_STRIDE_16:
784 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
785 default:
786 unreachable("invalid vstride");
787 }
788 }
789
790
791 static enum gen10_align1_3src_src_horizontal_stride
792 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
793 {
794 switch (hstride) {
795 case BRW_HORIZONTAL_STRIDE_0:
796 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
797 case BRW_HORIZONTAL_STRIDE_1:
798 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
799 case BRW_HORIZONTAL_STRIDE_2:
800 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
801 case BRW_HORIZONTAL_STRIDE_4:
802 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
803 default:
804 unreachable("invalid hstride");
805 }
806 }
807
808 static brw_inst *
809 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
810 struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
811 {
812 const struct gen_device_info *devinfo = p->devinfo;
813 brw_inst *inst = next_insn(p, opcode);
814
815 gen7_convert_mrf_to_grf(p, &dest);
816
817 assert(dest.nr < 128);
818 assert(src0.nr < 128);
819 assert(src1.nr < 128);
820 assert(src2.nr < 128);
821 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
822 assert(src0.address_mode == BRW_ADDRESS_DIRECT);
823 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
824 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
825
826 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
827 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
828 dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
829
830 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
831 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
832 BRW_ALIGN1_3SRC_ACCUMULATOR);
833 brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
834 } else {
835 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
836 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
837 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
838 }
839 brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
840
841 brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
842
843 if (brw_reg_type_is_floating_point(dest.type)) {
844 brw_inst_set_3src_a1_exec_type(devinfo, inst,
845 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
846 } else {
847 brw_inst_set_3src_a1_exec_type(devinfo, inst,
848 BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
849 }
850
851 brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
852 brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
853 brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
854 brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
855
856 brw_inst_set_3src_a1_src0_vstride(devinfo, inst,
857 to_3src_align1_vstride(src0.vstride));
858 brw_inst_set_3src_a1_src1_vstride(devinfo, inst,
859 to_3src_align1_vstride(src1.vstride));
860 /* no vstride on src2 */
861
862 brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
863 to_3src_align1_hstride(src0.hstride));
864 brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
865 to_3src_align1_hstride(src1.hstride));
866 brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
867 to_3src_align1_hstride(src2.hstride));
868
869 brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
870 if (src0.type == BRW_REGISTER_TYPE_NF) {
871 brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
872 } else {
873 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
874 }
875 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
876 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
877
878 brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
879 if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
880 brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
881 } else {
882 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
883 }
884 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
885 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
886
887 brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
888 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
889 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
890 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
891
892 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
893 src0.file == BRW_IMMEDIATE_VALUE ||
894 (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
895 src0.type == BRW_REGISTER_TYPE_NF));
896 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
897 src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
898 assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
899 src2.file == BRW_IMMEDIATE_VALUE);
900
901 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
902 src0.file == BRW_GENERAL_REGISTER_FILE ?
903 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
904 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
905 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
906 src1.file == BRW_GENERAL_REGISTER_FILE ?
907 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
908 BRW_ALIGN1_3SRC_ACCUMULATOR);
909 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
910 src2.file == BRW_GENERAL_REGISTER_FILE ?
911 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
912 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
913 } else {
914 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
915 dest.file == BRW_MESSAGE_REGISTER_FILE);
916 assert(dest.type == BRW_REGISTER_TYPE_F ||
917 dest.type == BRW_REGISTER_TYPE_DF ||
918 dest.type == BRW_REGISTER_TYPE_D ||
919 dest.type == BRW_REGISTER_TYPE_UD);
920 if (devinfo->gen == 6) {
921 brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
922 dest.file == BRW_MESSAGE_REGISTER_FILE);
923 }
924 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
925 brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
926 brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
927
928 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
929 brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
930 brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
931 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
932 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
933 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
934 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
935 src0.vstride == BRW_VERTICAL_STRIDE_0);
936
937 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
938 brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
939 brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
940 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
941 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
942 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
943 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
944 src1.vstride == BRW_VERTICAL_STRIDE_0);
945
946 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
947 brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
948 brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
949 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
950 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
951 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
952 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
953 src2.vstride == BRW_VERTICAL_STRIDE_0);
954
955 if (devinfo->gen >= 7) {
956 /* Set both the source and destination types based on dest.type,
957 * ignoring the source register types. The MAD and LRP emitters ensure
958 * that all four types are float. The BFE and BFI2 emitters, however,
959 * may send us mixed D and UD types and want us to ignore that and use
960 * the destination type.
961 */
962 brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
963 brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
964 }
965 }
966
967 return inst;
968 }
969
970
971 /***********************************************************************
972 * Convenience routines.
973 */
/* Template for the public one-source ALU emitters (brw_NOT, brw_FRC, ...):
 * each expands to a thin wrapper around brw_alu1() with the matching opcode.
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
          struct brw_reg dest,                              \
          struct brw_reg src0)                              \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}
981
/* Template for the public two-source ALU emitters: wraps brw_alu2(). */
#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
          struct brw_reg dest,                              \
          struct brw_reg src0,                              \
          struct brw_reg src1)                              \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}
990
/* Template for the public three-source ALU emitters: wraps brw_alu3(). */
#define ALU3(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
          struct brw_reg dest,                                    \
          struct brw_reg src0,                                    \
          struct brw_reg src1,                                    \
          struct brw_reg src2)                                    \
{                                                                 \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}
1000
/* Like ALU3, but additionally asserts that all four operands are the same
 * floating-point type (all F or all DF) before emitting the instruction.
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
          struct brw_reg dest,                                  \
          struct brw_reg src0,                                  \
          struct brw_reg src1,                                  \
          struct brw_reg src2)                                  \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
1021
1022 /* Rounding operations (other than RNDD) require two instructions - the first
1023 * stores a rounded value (possibly the wrong way) in the dest register, but
1024 * also sets a per-channel "increment bit" in the flag register. A predicated
1025 * add of 1.0 fixes dest to contain the desired result.
1026 *
1027 * Sandybridge and later appear to round correctly without an ADD.
1028 */
/* Template for the rounding emitters; see the comment above for why a
 * predicated ADD of 1.0 follows the round instruction on gen < 6.
 */
#define ROUND(OP)                                                    \
void brw_##OP(struct brw_codegen *p,                                 \
         struct brw_reg dest,                                        \
         struct brw_reg src)                                         \
{                                                                    \
   const struct gen_device_info *devinfo = p->devinfo;               \
   brw_inst *rnd, *add;                                              \
   rnd = next_insn(p, BRW_OPCODE_##OP);                              \
   brw_set_dest(p, rnd, dest);                                       \
   brw_set_src0(p, rnd, src);                                        \
                                                                     \
   if (devinfo->gen < 6) {                                           \
      /* turn on round-increments */                                 \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);   \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                 \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
   }                                                                 \
}
1047
1048
/* Instantiate the public brw_<OP>() emitters from the templates above. */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
1082
1083 brw_inst *
1084 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1085 {
1086 const struct gen_device_info *devinfo = p->devinfo;
1087
1088 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1089 * To avoid the problems that causes, we use a <1,2,0> source region to read
1090 * each element twice.
1091 */
1092 if (devinfo->gen == 7 && !devinfo->is_haswell &&
1093 brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1094 dest.type == BRW_REGISTER_TYPE_DF &&
1095 (src0.type == BRW_REGISTER_TYPE_F ||
1096 src0.type == BRW_REGISTER_TYPE_D ||
1097 src0.type == BRW_REGISTER_TYPE_UD) &&
1098 !has_scalar_region(src0)) {
1099 assert(src0.vstride == BRW_VERTICAL_STRIDE_4 &&
1100 src0.width == BRW_WIDTH_4 &&
1101 src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1102
1103 src0.vstride = BRW_VERTICAL_STRIDE_1;
1104 src0.width = BRW_WIDTH_2;
1105 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1106 }
1107
1108 return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1109 }
1110
1111 brw_inst *
1112 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1113 struct brw_reg src0, struct brw_reg src1)
1114 {
1115 /* 6.2.2: add */
1116 if (src0.type == BRW_REGISTER_TYPE_F ||
1117 (src0.file == BRW_IMMEDIATE_VALUE &&
1118 src0.type == BRW_REGISTER_TYPE_VF)) {
1119 assert(src1.type != BRW_REGISTER_TYPE_UD);
1120 assert(src1.type != BRW_REGISTER_TYPE_D);
1121 }
1122
1123 if (src1.type == BRW_REGISTER_TYPE_F ||
1124 (src1.file == BRW_IMMEDIATE_VALUE &&
1125 src1.type == BRW_REGISTER_TYPE_VF)) {
1126 assert(src0.type != BRW_REGISTER_TYPE_UD);
1127 assert(src0.type != BRW_REGISTER_TYPE_D);
1128 }
1129
1130 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1131 }
1132
1133 brw_inst *
1134 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1135 struct brw_reg src0, struct brw_reg src1)
1136 {
1137 assert(dest.type == src0.type);
1138 assert(src0.type == src1.type);
1139 switch (src0.type) {
1140 case BRW_REGISTER_TYPE_B:
1141 case BRW_REGISTER_TYPE_UB:
1142 case BRW_REGISTER_TYPE_W:
1143 case BRW_REGISTER_TYPE_UW:
1144 case BRW_REGISTER_TYPE_D:
1145 case BRW_REGISTER_TYPE_UD:
1146 break;
1147 default:
1148 unreachable("Bad type for brw_AVG");
1149 }
1150
1151 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1152 }
1153
1154 brw_inst *
1155 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1156 struct brw_reg src0, struct brw_reg src1)
1157 {
1158 /* 6.32.38: mul */
1159 if (src0.type == BRW_REGISTER_TYPE_D ||
1160 src0.type == BRW_REGISTER_TYPE_UD ||
1161 src1.type == BRW_REGISTER_TYPE_D ||
1162 src1.type == BRW_REGISTER_TYPE_UD) {
1163 assert(dest.type != BRW_REGISTER_TYPE_F);
1164 }
1165
1166 if (src0.type == BRW_REGISTER_TYPE_F ||
1167 (src0.file == BRW_IMMEDIATE_VALUE &&
1168 src0.type == BRW_REGISTER_TYPE_VF)) {
1169 assert(src1.type != BRW_REGISTER_TYPE_UD);
1170 assert(src1.type != BRW_REGISTER_TYPE_D);
1171 }
1172
1173 if (src1.type == BRW_REGISTER_TYPE_F ||
1174 (src1.file == BRW_IMMEDIATE_VALUE &&
1175 src1.type == BRW_REGISTER_TYPE_VF)) {
1176 assert(src0.type != BRW_REGISTER_TYPE_UD);
1177 assert(src0.type != BRW_REGISTER_TYPE_D);
1178 }
1179
1180 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1181 src0.nr != BRW_ARF_ACCUMULATOR);
1182 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1183 src1.nr != BRW_ARF_ACCUMULATOR);
1184
1185 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1186 }
1187
1188 brw_inst *
1189 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1190 struct brw_reg src0, struct brw_reg src1)
1191 {
1192 src0.vstride = BRW_VERTICAL_STRIDE_0;
1193 src0.width = BRW_WIDTH_1;
1194 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1195 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1196 }
1197
1198 brw_inst *
1199 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1200 struct brw_reg src0, struct brw_reg src1)
1201 {
1202 src0.vstride = BRW_VERTICAL_STRIDE_0;
1203 src0.width = BRW_WIDTH_1;
1204 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1205 src1.vstride = BRW_VERTICAL_STRIDE_8;
1206 src1.width = BRW_WIDTH_8;
1207 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1208 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1209 }
1210
/**
 * Emit a float-to-half conversion of \p src into \p dst.
 *
 * Gen8+ uses a converting MOV to an HF-typed destination; Gen7 uses the
 * dedicated F32TO16 instruction.  When the high 16 bits of a UD destination
 * would otherwise be left stale, a second MOV writes zeros into them.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* View the UD destination as strided words so the conversion lands in
       * the low half of each dword and the zero fill can target the high half.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* The two writes touch disjoint halves of each dword, so suppress the
       * dependency between them with NoDDClr/NoDDChk.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1257
/**
 * Emit a half-to-float conversion of \p src into \p dst.
 *
 * Gen8+ uses a converting MOV from an HF-typed source; Gen7 uses the
 * dedicated F16TO32 instruction.
 */
brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}
1288
1289
/** Emit a NOP with every other instruction field cleared. */
void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   /* Wipe the defaults that next_insn() copied in, then restore the opcode. */
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}
1296
1297
1298
1299
1300
1301 /***********************************************************************
1302 * Comparisons, if/else/endif
1303 */
1304
1305 brw_inst *
1306 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1307 unsigned predicate_control)
1308 {
1309 const struct gen_device_info *devinfo = p->devinfo;
1310 struct brw_reg ip = brw_ip_reg();
1311 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1312
1313 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1314 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1315 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1316 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1317
1318 return inst;
1319 }
1320
/* Record \p inst on the IF stack, growing the stack if needed.  Offsets
 * into p->store are stored rather than pointers because the store may be
 * reallocated as more instructions are emitted.
 */
static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   /* Grow after the write: the previous growth guarantees room at the
    * current depth, and this one guarantees room for the next push.
    */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}
1333
1334 static brw_inst *
1335 pop_if_stack(struct brw_codegen *p)
1336 {
1337 p->if_stack_depth--;
1338 return &p->store[p->if_stack[p->if_stack_depth]];
1339 }
1340
/* Record \p inst (as a store offset) on the loop stack and reset the
 * IF-nesting counter for the new loop level.  Both arrays are grown in
 * lockstep; +1 in the size check leaves room for the if_depth_in_loop
 * write at the incremented depth below.
 */
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}
1356
1357 static brw_inst *
1358 get_inner_do_insn(struct brw_codegen *p)
1359 {
1360 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1361 }
1362
1363 /* EU takes the value from the flag register and pushes it onto some
1364 * sort of a stack (presumably merging with any flag value already on
1365 * the stack). Within an if block, the flags at the top of the stack
1366 * control execution on each channel of the unit, eg. on each of the
1367 * 16 pixel values in our wm programs.
1368 *
1369 * When the matching 'else' instruction is reached (presumably by
1370 * countdown of the instruction count patched in by our ELSE/ENDIF
1371 * functions), the relevant flags are inverted.
1372 *
1373 * When the matching 'endif' instruction is reached, the flags are
1374 * popped off. If the stack is now empty, normal execution resumes.
1375 */
/**
 * Emit an IF instruction with per-generation operand encoding and push it
 * on the IF stack so brw_ELSE/brw_ENDIF can patch its jump targets later.
 * The jump fields (gen4 jump count, gen6 jump count, or JIP/UIP) are left
 * zero here and filled in by patch_IF_ELSE().
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control causes an implied thread switch (not needed when
    * the IF will later be converted to an ADD in SPF mode).
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1419
1420 /* This function is only used for gen6-style IF instructions with an
1421 * embedded comparison (conditional modifier). It is not used on gen7.
1422 */
/**
 * Emit a gen6-style IF that compares \p src0 against \p src1 with
 * \p conditional instead of consuming an existing flag value.  The jump
 * count is left zero and patched later via patch_IF_ELSE().
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1445
1446 /**
1447 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1448 */
/**
 * In single-program-flow (SPF) mode, rewrite an IF (and optional ELSE) into
 * predicated ADDs on IP, since no mask-stack manipulation is needed.  Jump
 * distances are in bytes; each instruction is 16 bytes here (pre-gen6 only,
 * so never compacted).
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1486
1487 /**
1488 * Patch IF and ELSE instructions with appropriate jump targets.
1489 */
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * \param if_inst     the IF being patched (required)
 * \param else_inst   the matching ELSE, or NULL if there is none
 * \param endif_inst  the matching ENDIF (required)
 *
 * Jump distances are measured in instructions and scaled by
 * brw_jump_scale() into the units each generation's encoding expects.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1577
/**
 * Emit an ELSE with per-generation operand encoding and push it on the IF
 * stack.  Its jump fields are left zero and patched by patch_IF_ELSE()
 * when the matching ENDIF is emitted.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control causes an implied thread switch. */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1615
/**
 * Emit an ENDIF (unless SPF mode on Gen4/5 lets us convert the IF/ELSE to
 * ADDs instead), pop the matching IF and optional ELSE off the IF stack,
 * and patch all of their jump targets.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encoding, as in brw_IF/brw_ELSE. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1696
/**
 * Emit a BREAK.  On pre-gen6 the jump count is left zero and patched by
 * brw_patch_break_cont() when the enclosing WHILE is emitted; the pop count
 * records how many IF levels must be popped to exit the loop.
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
1723
/**
 * Emit a CONTINUE.  Like brw_BREAK, the pre-gen6 jump count is patched
 * later by brw_patch_break_cont().
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1747
/**
 * Emit a HALT (gen6+ only).  The UIP/JIP fields start as zero in src1 and
 * are filled in later, once the jump targets are known.
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1767
1768 /* DO/WHILE loop:
1769 *
1770 * The DO/WHILE is just an unterminated loop -- break or continue are
1771 * used for control within the loop. We have a few ways they can be
1772 * done.
1773 *
1774 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1775 * jip and no DO instruction.
1776 *
1777 * For non-uniform control flow pre-gen6, there's a DO instruction to
1778 * push the mask, and a WHILE to jump back, and BREAK to get out and
1779 * pop the mask.
1780 *
1781 * For gen6, there's no more mask stack, so no need for DO. WHILE
1782 * just points back to the first instruction of the loop.
1783 */
/**
 * Open a DO/WHILE loop.  On gen6+ (or SPF mode) no DO instruction exists;
 * the loop head is simply the next instruction slot, which is recorded on
 * the loop stack so brw_WHILE can jump back to it.  Pre-gen6 emits a real
 * DO to push the execution mask.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1810
1811 /**
1812 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1813 * instruction here.
1814 *
1815 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1816 * nesting, since it can always just point to the end of the block/current loop.
1817 */
1818 static void
1819 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1820 {
1821 const struct gen_device_info *devinfo = p->devinfo;
1822 brw_inst *do_inst = get_inner_do_insn(p);
1823 brw_inst *inst;
1824 unsigned br = brw_jump_scale(devinfo);
1825
1826 assert(devinfo->gen < 6);
1827
1828 for (inst = while_inst - 1; inst != do_inst; inst--) {
1829 /* If the jump count is != 0, that means that this instruction has already
1830 * been patched because it's part of a loop inside of the one we're
1831 * patching.
1832 */
1833 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1834 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1835 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1836 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1837 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1838 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1839 }
1840 }
1841 }
1842
/**
 * Close a DO/WHILE loop by emitting the WHILE (or, in single-program-flow
 * mode pre-Gen6, a plain ADD to the IP register).  Pops one level off the
 * loop stack.  The backward jump distance to the matching DO/loop-top is
 * encoded immediately; pre-Gen6 BREAK/CONT are patched here too.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* The jump offset encoding differs per generation: Gen8+ uses JIP in
       * src0 immediate; Gen7 uses JIP with null sources; Gen6 uses the
       * jump-count field with a word immediate destination.
       */
      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* Uniform control flow: the loop is just a backwards IP jump.
          * 16 is the instruction size in bytes on these generations.
          */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         /* Execution size must match the DO that opened the loop. */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Point any BREAK/CONT inside the loop at this WHILE. */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1904
1905 /* FORWARD JUMPS:
1906 */
1907 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1908 {
1909 const struct gen_device_info *devinfo = p->devinfo;
1910 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1911 unsigned jmpi = 1;
1912
1913 if (devinfo->gen >= 5)
1914 jmpi = 2;
1915
1916 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1917 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1918
1919 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1920 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1921 }
1922
1923 /* To integrate with the above, it makes sense that the comparison
1924 * instruction should populate the flag register. It might be simpler
1925 * just to use the flag reg for most WM tasks?
1926 */
1927 void brw_CMP(struct brw_codegen *p,
1928 struct brw_reg dest,
1929 unsigned conditional,
1930 struct brw_reg src0,
1931 struct brw_reg src1)
1932 {
1933 const struct gen_device_info *devinfo = p->devinfo;
1934 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1935
1936 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1937 brw_set_dest(p, insn, dest);
1938 brw_set_src0(p, insn, src0);
1939 brw_set_src1(p, insn, src1);
1940
1941 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1942 * page says:
1943 * "Any CMP instruction with a null destination must use a {switch}."
1944 *
1945 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1946 * mentioned on their work-arounds pages.
1947 */
1948 if (devinfo->gen == 7) {
1949 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1950 dest.nr == BRW_ARF_NULL) {
1951 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1952 }
1953 }
1954 }
1955
1956 /***********************************************************************
1957 * Helpers for the various SEND message types:
1958 */
1959
1960 /** Extended math function, float[8].
1961 */
1962 void gen4_math(struct brw_codegen *p,
1963 struct brw_reg dest,
1964 unsigned function,
1965 unsigned msg_reg_nr,
1966 struct brw_reg src,
1967 unsigned precision )
1968 {
1969 const struct gen_device_info *devinfo = p->devinfo;
1970 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1971 unsigned data_type;
1972 if (has_scalar_region(src)) {
1973 data_type = BRW_MATH_DATA_SCALAR;
1974 } else {
1975 data_type = BRW_MATH_DATA_VECTOR;
1976 }
1977
1978 assert(devinfo->gen < 6);
1979
1980 /* Example code doesn't set predicate_control for send
1981 * instructions.
1982 */
1983 brw_inst_set_pred_control(devinfo, insn, 0);
1984 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1985
1986 brw_set_dest(p, insn, dest);
1987 brw_set_src0(p, insn, src);
1988 brw_set_math_message(p,
1989 insn,
1990 function,
1991 src.type == BRW_REGISTER_TYPE_D,
1992 precision,
1993 data_type);
1994 }
1995
1996 void gen6_math(struct brw_codegen *p,
1997 struct brw_reg dest,
1998 unsigned function,
1999 struct brw_reg src0,
2000 struct brw_reg src1)
2001 {
2002 const struct gen_device_info *devinfo = p->devinfo;
2003 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
2004
2005 assert(devinfo->gen >= 6);
2006
2007 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
2008 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
2009
2010 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
2011 if (devinfo->gen == 6) {
2012 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
2013 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
2014 }
2015
2016 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2017 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
2018 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2019 assert(src0.type != BRW_REGISTER_TYPE_F);
2020 assert(src1.type != BRW_REGISTER_TYPE_F);
2021 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2022 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2023 } else {
2024 assert(src0.type == BRW_REGISTER_TYPE_F);
2025 assert(src1.type == BRW_REGISTER_TYPE_F);
2026 }
2027
2028 /* Source modifiers are ignored for extended math instructions on Gen6. */
2029 if (devinfo->gen == 6) {
2030 assert(!src0.negate);
2031 assert(!src0.abs);
2032 assert(!src1.negate);
2033 assert(!src1.abs);
2034 }
2035
2036 brw_inst_set_math_function(devinfo, insn, function);
2037
2038 brw_set_dest(p, insn, dest);
2039 brw_set_src0(p, insn, src0);
2040 brw_set_src1(p, insn, src1);
2041 }
2042
2043 /**
2044 * Return the right surface index to access the thread scratch space using
2045 * stateless dataport messages.
2046 */
2047 unsigned
2048 brw_scratch_surface_idx(const struct brw_codegen *p)
2049 {
2050 /* The scratch space is thread-local so IA coherency is unnecessary. */
2051 if (p->devinfo->gen >= 8)
2052 return GEN8_BTI_STATELESS_NON_COHERENT;
2053 else
2054 return BRW_BTI_STATELESS;
2055 }
2056
2057 /**
2058 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2059 * using a constant offset per channel.
2060 *
2061 * The offset must be aligned to oword size (16 bytes). Used for
2062 * register spilling.
2063 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
				   struct brw_reg mrf,
				   int num_regs,
				   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Which data port cache services the write depends on the generation. */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   uint32_t msg_type;

   /* Gen6+ expresses the offset in owords (16 bytes) rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One register of header plus the payload registers. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
	 src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
	 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       brw_scratch_surface_idx(p),
			       BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
			       msg_type,
			       target_cache,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
2167
2168
2169 /**
2170 * Read a block of owords (half a GRF each) from the scratch buffer
2171 * using a constant index per channel.
2172 *
2173 * Offset must be aligned to oword size (16 bytes). Used for register
2174 * spilling.
2175 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Gen6+ expresses the offset in owords (16 bytes) rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* One response register per GRF read back. */
   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);

   /* Build the message header: a copy of g0 with the scratch offset
    * written into element 2.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   /* Emit the SEND performing the oword block read. */
   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest);	/* UW? */
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
	 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_dp_read_message(p,
			      insn,
			      brw_scratch_surface_idx(p),
			      BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      target_cache,
			      1, /* msg_length */
                              true, /* header_present */
			      rlen);
   }
}
2248
/**
 * Read a block of registers from the thread scratch space using the Gen7+
 * dedicated scratch-block dataport message.  Unlike the oword path this
 * uses HWord (register-sized) granularity and the scratch offset stored in
 * g0.5, so the header is just a copy of g0.
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
2282
2283 /**
2284 * Read float[4] vectors from the data port constant cache.
2285 * Location (in buffer) should be a multiple of 16.
2286 * Used for fetching shader constants.
2287 */
void brw_oword_block_read(struct brw_codegen *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Gen6+ reads constants through the constant cache. */
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Force a predictable instruction state for the header setup and SEND. */
   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   /* Header: copy of g0 with the read offset in element 2. */
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_dp_read_message(p, insn, bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           target_cache,
                           1, /* msg_length */
                           true, /* header_present */
                           DIV_ROUND_UP(exec_size, 8)); /* response_length */

   brw_pop_insn_state(p);
}
2347
2348
/**
 * Emit a render-target write (SENDC on Gen6+, SEND before that).  On Gen6+
 * the color payload is sent headerless from `payload`; pre-Gen6 the payload
 * lives in MRFs and `implied_header` is the src0 of the implied move.
 */
void brw_fb_WRITE(struct brw_codegen *p,
                  struct brw_reg payload,
                  struct brw_reg implied_header,
                  unsigned msg_control,
                  unsigned binding_table_index,
                  unsigned msg_length,
                  unsigned response_length,
                  bool eot,
                  bool last_render_target,
                  bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   /* The (null) destination width mirrors the execution size. */
   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   /* SENDC (with dependency check) is used on Gen6+. */
   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
			    insn,
			    binding_table_index,
			    msg_control,
			    msg_type,
                            target_cache,
			    msg_length,
			    header_present,
			    last_render_target,
			    response_length,
			    eot,
			    0 /* send_commit_msg */);
}
2408
/**
 * Emit a Gen9+ render-target read (SENDC to the render cache).  The message
 * subtype encodes SIMD16 (0) vs SIMD8 (1), and bit 5 of the control field
 * selects per-sample reads.  The slot group is derived from the current
 * quarter-control group.
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   const unsigned msg_subtype =
      brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_dp_read_message(p, insn, binding_table_index,
                           per_sample << 5 | msg_subtype,
                           GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                           GEN6_SFID_DATAPORT_RENDER_CACHE,
                           msg_length, true /* header_present */,
                           response_length);
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}
2436
2437 /**
2438 * Texture sample instruction.
2439 * Note: the msg_type plus msg_length values determine exactly what kind
2440 * of sampling operation is performed. See volume 4, page 161 of docs.
2441 */
void brw_SAMPLE(struct brw_codegen *p,
		struct brw_reg dest,
		unsigned msg_reg_nr,
		struct brw_reg src0,
		unsigned binding_table_index,
		unsigned sampler,
		unsigned msg_type,
		unsigned response_length,
		unsigned msg_length,
		unsigned header_present,
		unsigned simd_mode,
		unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr == -1 (all-ones, since the parameter is unsigned) means the
    * caller has already placed the payload; skip the implied move.
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
2493
2494 /* Adjust the message header's sampler state pointer to
2495 * select the correct group of 16 samplers.
2496 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      /* Samplers 0..15 need no adjustment; bump the state pointer
       * (header element 3) for each group of 16 beyond that.
       */
      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* Compute 16 * (sampler / 16) * 16 at runtime: mask off the low
       * nibble of the index and shift into byte units.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2539
2540 /* All these variables are pretty confusing - we might be better off
2541 * using bitmasks and macros for this, in the old style. Or perhaps
2542 * just having the caller instantiate the fields in dword3 itself.
2543 */
void brw_urb_WRITE(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header:
       * OR 0xff00 into dword 5 of the header (g0.5 into m<msg_reg_nr>.5).
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2591
/**
 * Emit a SEND to the given shared function (sfid) with either an immediate
 * or a register-indirect message descriptor.  In the indirect case the
 * descriptor is OR'd into address register a0 first, so callers can still
 * add descriptor bits with the brw_set_*_message() helpers.
 *
 * Returns a pointer to the "setup" instruction — the SEND itself in the
 * immediate case, or the OR in the indirect case — which is the one whose
 * src1/descriptor callers should continue to modify.
 */
struct brw_inst *
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;
   int setup;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   /* We hold on to the setup instruction (the SEND in the direct case, the OR
    * in the indirect case) by its index in the instruction store.  The
    * pointer returned by next_insn() may become invalid if emitting the SEND
    * in the indirect case reallocs the store.
    */

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      setup = p->nr_insn;
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, desc);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the usual
       * brw_set_*_message() helper functions.
       */
      setup = p->nr_insn;
      brw_OR(p, addr, desc, brw_imm_ud(0));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   /* Narrow destinations force a matching narrower execution size. */
   if (dst.width < BRW_EXECUTE_8)
      brw_inst_set_exec_size(devinfo, send, dst.width);

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);

   return &p->store[setup];
}
2649
/**
 * Like brw_send_indirect_message(), but for surface-typed messages: a
 * non-immediate surface index is masked to 8 bits and loaded into a0
 * first, then used as the (indirect) descriptor.  Message length, response
 * length, and header presence are set on the returned setup instruction.
 */
static struct brw_inst *
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned message_len,
                                  unsigned response_len,
                                  bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      insn = brw_AND(p, addr,
                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                               BRW_GET_SWZ(surface.swizzle, 0)),
                     brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
   brw_inst_set_mlen(devinfo, insn, message_len);
   brw_inst_set_rlen(devinfo, insn, response_len);
   brw_inst_set_header_present(devinfo, insn, header_present);

   return insn;
}
2692
2693 static bool
2694 while_jumps_before_offset(const struct gen_device_info *devinfo,
2695 brw_inst *insn, int while_offset, int start_offset)
2696 {
2697 int scale = 16 / brw_jump_scale(devinfo);
2698 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2699 : brw_inst_jip(devinfo, insn);
2700 assert(jip < 0);
2701 return while_offset + jip * scale <= start_offset;
2702 }
2703
2704
2705 static int
2706 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2707 {
2708 int offset;
2709 void *store = p->store;
2710 const struct gen_device_info *devinfo = p->devinfo;
2711
2712 int depth = 0;
2713
2714 for (offset = next_offset(devinfo, store, start_offset);
2715 offset < p->next_insn_offset;
2716 offset = next_offset(devinfo, store, offset)) {
2717 brw_inst *insn = store + offset;
2718
2719 switch (brw_inst_opcode(devinfo, insn)) {
2720 case BRW_OPCODE_IF:
2721 depth++;
2722 break;
2723 case BRW_OPCODE_ENDIF:
2724 if (depth == 0)
2725 return offset;
2726 depth--;
2727 break;
2728 case BRW_OPCODE_WHILE:
2729 /* If the while doesn't jump before our instruction, it's the end
2730 * of a sibling do...while loop. Ignore it.
2731 */
2732 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2733 continue;
2734 /* fallthrough */
2735 case BRW_OPCODE_ELSE:
2736 case BRW_OPCODE_HALT:
2737 if (depth == 0)
2738 return offset;
2739 }
2740 }
2741
2742 return 0;
2743 }
2744
2745 /* There is no DO instruction on gen6, so to find the end of the loop
2746 * we have to see if the loop is jumping back before our start
2747 * instruction.
2748 */
2749 static int
2750 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2751 {
2752 const struct gen_device_info *devinfo = p->devinfo;
2753 int offset;
2754 void *store = p->store;
2755
2756 assert(devinfo->gen >= 6);
2757
2758 /* Always start after the instruction (such as a WHILE) we're trying to fix
2759 * up.
2760 */
2761 for (offset = next_offset(devinfo, store, start_offset);
2762 offset < p->next_insn_offset;
2763 offset = next_offset(devinfo, store, offset)) {
2764 brw_inst *insn = store + offset;
2765
2766 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2767 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2768 return offset;
2769 }
2770 }
2771 assert(!"not reached");
2772 return start_offset;
2773 }
2774
2775 /* After program generation, go back and update the UIP and JIP of
2776 * BREAK, CONT, and HALT instructions to their correct locations.
2777 */
/**
 * After program generation, go back and update the UIP and JIP of BREAK,
 * CONT, HALT, and ENDIF instructions to their correct locations.  Only
 * meaningful on Gen6+; earlier generations use jump counts patched at
 * emit time.
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   /* Offsets are in bytes (16 per instruction); JIP/UIP are in jump units. */
   int scale = 16 / br;
   void *store = p->store;

   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      /* Compacted instructions would break the fixed 16-byte stride. */
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
			  (brw_find_loop_end(p, offset) - offset +
                           (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end jumps to the next
          * instruction (1 * br).
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }
   }
}
2847
2848 void brw_ff_sync(struct brw_codegen *p,
2849 struct brw_reg dest,
2850 unsigned msg_reg_nr,
2851 struct brw_reg src0,
2852 bool allocate,
2853 unsigned response_length,
2854 bool eot)
2855 {
2856 const struct gen_device_info *devinfo = p->devinfo;
2857 brw_inst *insn;
2858
2859 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2860
2861 insn = next_insn(p, BRW_OPCODE_SEND);
2862 brw_set_dest(p, insn, dest);
2863 brw_set_src0(p, insn, src0);
2864 brw_set_src1(p, insn, brw_imm_d(0));
2865
2866 if (devinfo->gen < 6)
2867 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2868
2869 brw_set_ff_sync_message(p,
2870 insn,
2871 allocate,
2872 response_length,
2873 eot);
2874 }
2875
2876 /**
2877 * Emit the SEND instruction necessary to generate stream output data on Gen6
2878 * (for transform feedback).
2879 *
2880 * If send_commit_msg is true, this is the last piece of stream output data
2881 * from this thread, so send the data as a committed write. According to the
2882 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2883 *
2884 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2885 * writes are complete by sending the final write as a committed write."
2886 */
2887 void
2888 brw_svb_write(struct brw_codegen *p,
2889 struct brw_reg dest,
2890 unsigned msg_reg_nr,
2891 struct brw_reg src0,
2892 unsigned binding_table_index,
2893 bool send_commit_msg)
2894 {
2895 const struct gen_device_info *devinfo = p->devinfo;
2896 const unsigned target_cache =
2897 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2898 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2899 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2900 brw_inst *insn;
2901
2902 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2903
2904 insn = next_insn(p, BRW_OPCODE_SEND);
2905 brw_set_dest(p, insn, dest);
2906 brw_set_src0(p, insn, src0);
2907 brw_set_src1(p, insn, brw_imm_d(0));
2908 brw_set_dp_write_message(p, insn,
2909 binding_table_index,
2910 0, /* msg_control: ignored */
2911 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2912 target_cache,
2913 1, /* msg_length */
2914 true, /* header_present */
2915 0, /* last_render_target: ignored */
2916 send_commit_msg, /* response_length */
2917 0, /* end_of_thread */
2918 send_commit_msg); /* send_commit_msg */
2919 }
2920
2921 static unsigned
2922 brw_surface_payload_size(struct brw_codegen *p,
2923 unsigned num_channels,
2924 bool has_simd4x2,
2925 bool has_simd16)
2926 {
2927 if (has_simd4x2 && brw_get_default_access_mode(p) == BRW_ALIGN_16)
2928 return 1;
2929 else if (has_simd16 && brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2930 return 2 * num_channels;
2931 else
2932 return num_channels;
2933 }
2934
2935 static void
2936 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2937 brw_inst *insn,
2938 unsigned atomic_op,
2939 bool response_expected)
2940 {
2941 const struct gen_device_info *devinfo = p->devinfo;
2942 unsigned msg_control =
2943 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2944 (response_expected ? 1 << 5 : 0); /* Return data expected */
2945
2946 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2947 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2948 if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
2949 msg_control |= 1 << 4; /* SIMD8 mode */
2950
2951 brw_inst_set_dp_msg_type(devinfo, insn,
2952 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2953 } else {
2954 brw_inst_set_dp_msg_type(devinfo, insn,
2955 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2956 }
2957 } else {
2958 brw_inst_set_dp_msg_type(devinfo, insn,
2959 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2960
2961 if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
2962 msg_control |= 1 << 4; /* SIMD8 mode */
2963 }
2964
2965 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2966 }
2967
/**
 * Emit an untyped atomic operation (one of BRW_AOP_*) on \p surface.
 *
 * \p dst receives the return data when \p response_expected is set.  Note
 * that response_expected (0 or 1) is also passed to
 * brw_surface_payload_size() as its num_channels argument, so no response
 * registers are requested when no return data is expected.
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, true),
      header_present);

   brw_set_dp_untyped_atomic_message(
      p, insn, atomic_op, response_expected);
}
2999
3000 static void
3001 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
3002 struct brw_inst *insn,
3003 unsigned num_channels)
3004 {
3005 const struct gen_device_info *devinfo = p->devinfo;
3006 /* Set mask of 32-bit channels to drop. */
3007 unsigned msg_control = 0xf & (0xf << num_channels);
3008
3009 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3010 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
3011 msg_control |= 1 << 4; /* SIMD16 mode */
3012 else
3013 msg_control |= 2 << 4; /* SIMD8 mode */
3014 }
3015
3016 brw_inst_set_dp_msg_type(devinfo, insn,
3017 (devinfo->gen >= 8 || devinfo->is_haswell ?
3018 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
3019 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
3020 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3021 }
3022
3023 void
3024 brw_untyped_surface_read(struct brw_codegen *p,
3025 struct brw_reg dst,
3026 struct brw_reg payload,
3027 struct brw_reg surface,
3028 unsigned msg_length,
3029 unsigned num_channels)
3030 {
3031 const struct gen_device_info *devinfo = p->devinfo;
3032 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3033 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3034 GEN7_SFID_DATAPORT_DATA_CACHE);
3035 struct brw_inst *insn = brw_send_indirect_surface_message(
3036 p, sfid, dst, payload, surface, msg_length,
3037 brw_surface_payload_size(p, num_channels, true, true),
3038 false);
3039
3040 brw_set_dp_untyped_surface_read_message(
3041 p, insn, num_channels);
3042 }
3043
3044 static void
3045 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
3046 struct brw_inst *insn,
3047 unsigned num_channels)
3048 {
3049 const struct gen_device_info *devinfo = p->devinfo;
3050 /* Set mask of 32-bit channels to drop. */
3051 unsigned msg_control = 0xf & (0xf << num_channels);
3052
3053 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3054 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
3055 msg_control |= 1 << 4; /* SIMD16 mode */
3056 else
3057 msg_control |= 2 << 4; /* SIMD8 mode */
3058 } else {
3059 if (devinfo->gen >= 8 || devinfo->is_haswell)
3060 msg_control |= 0 << 4; /* SIMD4x2 mode */
3061 else
3062 msg_control |= 2 << 4; /* SIMD8 mode */
3063 }
3064
3065 brw_inst_set_dp_msg_type(devinfo, insn,
3066 devinfo->gen >= 8 || devinfo->is_haswell ?
3067 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
3068 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
3069 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3070 }
3071
3072 void
3073 brw_untyped_surface_write(struct brw_codegen *p,
3074 struct brw_reg payload,
3075 struct brw_reg surface,
3076 unsigned msg_length,
3077 unsigned num_channels,
3078 bool header_present)
3079 {
3080 const struct gen_device_info *devinfo = p->devinfo;
3081 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3082 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3083 GEN7_SFID_DATAPORT_DATA_CACHE);
3084 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3085 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3086 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3087 WRITEMASK_X : WRITEMASK_XYZW;
3088 struct brw_inst *insn = brw_send_indirect_surface_message(
3089 p, sfid, brw_writemask(brw_null_reg(), mask),
3090 payload, surface, msg_length, 0, header_present);
3091
3092 brw_set_dp_untyped_surface_write_message(
3093 p, insn, num_channels);
3094 }
3095
3096 static unsigned
3097 brw_byte_scattered_data_element_from_bit_size(unsigned bit_size)
3098 {
3099 switch (bit_size) {
3100 case 8:
3101 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_BYTE;
3102 case 16:
3103 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_WORD;
3104 case 32:
3105 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_DWORD;
3106 default:
3107 unreachable("Unsupported bit_size for byte scattered messages");
3108 }
3109 }
3110
3111
3112 void
3113 brw_byte_scattered_read(struct brw_codegen *p,
3114 struct brw_reg dst,
3115 struct brw_reg payload,
3116 struct brw_reg surface,
3117 unsigned msg_length,
3118 unsigned bit_size)
3119 {
3120 const struct gen_device_info *devinfo = p->devinfo;
3121 assert(devinfo->gen > 7 || devinfo->is_haswell);
3122 assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
3123 const unsigned sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
3124
3125 struct brw_inst *insn = brw_send_indirect_surface_message(
3126 p, sfid, dst, payload, surface, msg_length,
3127 brw_surface_payload_size(p, 1, true, true),
3128 false);
3129
3130 unsigned msg_control =
3131 brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
3132
3133 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
3134 msg_control |= 1; /* SIMD16 mode */
3135 else
3136 msg_control |= 0; /* SIMD8 mode */
3137
3138 brw_inst_set_dp_msg_type(devinfo, insn,
3139 HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ);
3140 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3141 }
3142
3143 void
3144 brw_byte_scattered_write(struct brw_codegen *p,
3145 struct brw_reg payload,
3146 struct brw_reg surface,
3147 unsigned msg_length,
3148 unsigned bit_size,
3149 bool header_present)
3150 {
3151 const struct gen_device_info *devinfo = p->devinfo;
3152 assert(devinfo->gen > 7 || devinfo->is_haswell);
3153 assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
3154 const unsigned sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
3155
3156 struct brw_inst *insn = brw_send_indirect_surface_message(
3157 p, sfid, brw_writemask(brw_null_reg(), WRITEMASK_XYZW),
3158 payload, surface, msg_length, 0, header_present);
3159
3160 unsigned msg_control =
3161 brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
3162
3163 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
3164 msg_control |= 1;
3165 else
3166 msg_control |= 0;
3167
3168 brw_inst_set_dp_msg_type(devinfo, insn,
3169 HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE);
3170 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3171 }
3172
3173 static void
3174 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3175 struct brw_inst *insn,
3176 unsigned atomic_op,
3177 bool response_expected)
3178 {
3179 const struct gen_device_info *devinfo = p->devinfo;
3180 unsigned msg_control =
3181 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3182 (response_expected ? 1 << 5 : 0); /* Return data expected */
3183
3184 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3185 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3186 if ((brw_get_default_group(p) / 8) % 2 == 1)
3187 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3188
3189 brw_inst_set_dp_msg_type(devinfo, insn,
3190 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3191 } else {
3192 brw_inst_set_dp_msg_type(devinfo, insn,
3193 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3194 }
3195
3196 } else {
3197 brw_inst_set_dp_msg_type(devinfo, insn,
3198 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3199
3200 if ((brw_get_default_group(p) / 8) % 2 == 1)
3201 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3202 }
3203
3204 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3205 }
3206
3207 void
3208 brw_typed_atomic(struct brw_codegen *p,
3209 struct brw_reg dst,
3210 struct brw_reg payload,
3211 struct brw_reg surface,
3212 unsigned atomic_op,
3213 unsigned msg_length,
3214 bool response_expected,
3215 bool header_present) {
3216 const struct gen_device_info *devinfo = p->devinfo;
3217 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3218 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3219 GEN6_SFID_DATAPORT_RENDER_CACHE);
3220 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3221 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3222 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3223 struct brw_inst *insn = brw_send_indirect_surface_message(
3224 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3225 brw_surface_payload_size(p, response_expected,
3226 devinfo->gen >= 8 || devinfo->is_haswell, false),
3227 header_present);
3228
3229 brw_set_dp_typed_atomic_message(
3230 p, insn, atomic_op, response_expected);
3231 }
3232
3233 static void
3234 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3235 struct brw_inst *insn,
3236 unsigned num_channels)
3237 {
3238 const struct gen_device_info *devinfo = p->devinfo;
3239 /* Set mask of unused channels. */
3240 unsigned msg_control = 0xf & (0xf << num_channels);
3241
3242 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3243 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3244 if ((brw_get_default_group(p) / 8) % 2 == 1)
3245 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3246 else
3247 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3248 }
3249
3250 brw_inst_set_dp_msg_type(devinfo, insn,
3251 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3252 } else {
3253 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3254 if ((brw_get_default_group(p) / 8) % 2 == 1)
3255 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3256 }
3257
3258 brw_inst_set_dp_msg_type(devinfo, insn,
3259 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3260 }
3261
3262 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3263 }
3264
3265 void
3266 brw_typed_surface_read(struct brw_codegen *p,
3267 struct brw_reg dst,
3268 struct brw_reg payload,
3269 struct brw_reg surface,
3270 unsigned msg_length,
3271 unsigned num_channels,
3272 bool header_present)
3273 {
3274 const struct gen_device_info *devinfo = p->devinfo;
3275 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3276 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3277 GEN6_SFID_DATAPORT_RENDER_CACHE);
3278 struct brw_inst *insn = brw_send_indirect_surface_message(
3279 p, sfid, dst, payload, surface, msg_length,
3280 brw_surface_payload_size(p, num_channels,
3281 devinfo->gen >= 8 || devinfo->is_haswell, false),
3282 header_present);
3283
3284 brw_set_dp_typed_surface_read_message(
3285 p, insn, num_channels);
3286 }
3287
3288 static void
3289 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3290 struct brw_inst *insn,
3291 unsigned num_channels)
3292 {
3293 const struct gen_device_info *devinfo = p->devinfo;
3294 /* Set mask of unused channels. */
3295 unsigned msg_control = 0xf & (0xf << num_channels);
3296
3297 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3298 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3299 if ((brw_get_default_group(p) / 8) % 2 == 1)
3300 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3301 else
3302 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3303 }
3304
3305 brw_inst_set_dp_msg_type(devinfo, insn,
3306 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3307
3308 } else {
3309 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3310 if ((brw_get_default_group(p) / 8) % 2 == 1)
3311 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3312 }
3313
3314 brw_inst_set_dp_msg_type(devinfo, insn,
3315 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3316 }
3317
3318 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3319 }
3320
3321 void
3322 brw_typed_surface_write(struct brw_codegen *p,
3323 struct brw_reg payload,
3324 struct brw_reg surface,
3325 unsigned msg_length,
3326 unsigned num_channels,
3327 bool header_present)
3328 {
3329 const struct gen_device_info *devinfo = p->devinfo;
3330 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3331 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3332 GEN6_SFID_DATAPORT_RENDER_CACHE);
3333 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3334 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3335 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3336 WRITEMASK_X : WRITEMASK_XYZW);
3337 struct brw_inst *insn = brw_send_indirect_surface_message(
3338 p, sfid, brw_writemask(brw_null_reg(), mask),
3339 payload, surface, msg_length, 0, header_present);
3340
3341 brw_set_dp_typed_surface_write_message(
3342 p, insn, num_channels);
3343 }
3344
3345 static void
3346 brw_set_memory_fence_message(struct brw_codegen *p,
3347 struct brw_inst *insn,
3348 enum brw_message_target sfid,
3349 bool commit_enable)
3350 {
3351 const struct gen_device_info *devinfo = p->devinfo;
3352
3353 brw_set_message_descriptor(p, insn, sfid,
3354 1 /* message length */,
3355 (commit_enable ? 1 : 0) /* response length */,
3356 true /* header present */,
3357 false);
3358
3359 switch (sfid) {
3360 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3361 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3362 break;
3363 case GEN7_SFID_DATAPORT_DATA_CACHE:
3364 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3365 break;
3366 default:
3367 unreachable("Not reached");
3368 }
3369
3370 if (commit_enable)
3371 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3372 }
3373
/**
 * Emit the SEND(s) implementing a memory fence.
 *
 * \p dst is used purely as a destination for dependency tracking -- the
 * fence message writes nothing useful back.  On IVB two fences are
 * emitted (data cache and render cache) and their responses are chained
 * with a MOV to force ordering between the two pipelines.
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 enum opcode send_op)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* NOTE(review): commit is presumably required on IVB because the MOV
    * below reads the fence responses -- confirm.
    */
   const bool commit_enable =
      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
      (devinfo->gen == 7 && !devinfo->is_haswell);
   struct brw_inst *insn;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = vec1(dst);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, send_op);
   dst = retype(dst, BRW_REGISTER_TYPE_UW);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too. Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, send_op);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   brw_pop_insn_state(p);
}
3421
3422 void
3423 brw_pixel_interpolator_query(struct brw_codegen *p,
3424 struct brw_reg dest,
3425 struct brw_reg mrf,
3426 bool noperspective,
3427 unsigned mode,
3428 struct brw_reg data,
3429 unsigned msg_length,
3430 unsigned response_length)
3431 {
3432 const struct gen_device_info *devinfo = p->devinfo;
3433 struct brw_inst *insn;
3434 const uint16_t exec_size = brw_get_default_exec_size(p);
3435
3436 /* brw_send_indirect_message will automatically use a direct send message
3437 * if data is actually immediate.
3438 */
3439 insn = brw_send_indirect_message(p,
3440 GEN7_SFID_PIXEL_INTERPOLATOR,
3441 dest,
3442 mrf,
3443 vec1(data));
3444 brw_inst_set_mlen(devinfo, insn, msg_length);
3445 brw_inst_set_rlen(devinfo, insn, response_length);
3446
3447 brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
3448 brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
3449 brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
3450 brw_inst_set_pi_message_type(devinfo, insn, mode);
3451 }
3452
/**
 * Emit code that computes the index of the first active channel of the
 * current execution mask into (component X of) \p dst.
 *
 * \p mask must be a UD register (or immediate) holding the dispatch or
 * vector mask; on Gen8+ it is combined with ce0 to discard channels that
 * were never dispatched (see the comment below).  An all-ones immediate
 * mask skips that correction.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask. The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n). Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         /* Gen7: reconstruct the execution mask in a flag register using
          * predicated zero-writes, then FBL the flag bits.
          */
         const struct brw_reg flag = brw_flag_reg(
            brw_inst_flag_reg_nr(devinfo, p->current),
            brw_inst_flag_subreg_nr(devinfo, p->current));

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0. We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register. Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3566
/**
 * Emit code that copies the component of \p src selected by the runtime
 * index \p idx into \p dst (a dynamically-indexed "broadcast").
 *
 * \p src must be a direct GRF with the same type as \p dst (asserted
 * below).  When the source is already uniform or the index is an
 * immediate, this degenerates to a plain MOV; otherwise Align1 mode uses
 * indirect addressing through a0 and Align16 mode uses a flag-predicated
 * SEL.
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address. The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV. Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3686
3687 /**
3688 * This instruction is generated as a single-channel align1 instruction by
3689 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3690 *
3691 * We can't use the typed atomic op in the FS because that has the execution
3692 * mask ANDed with the pixel mask, but we just want to write the one dword for
3693 * all the pixels.
3694 *
3695 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3696 * one u32. So we use the same untyped atomic write message as the pixel
3697 * shader.
3698 *
3699 * The untyped atomic operation requires a BUFFER surface type with RAW
3700 * format, and is only accessible through the legacy DATA_CACHE dataport
3701 * messages.
3702 */
3703 void brw_shader_time_add(struct brw_codegen *p,
3704 struct brw_reg payload,
3705 uint32_t surf_index)
3706 {
3707 const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
3708 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3709 GEN7_SFID_DATAPORT_DATA_CACHE);
3710 assert(p->devinfo->gen >= 7);
3711
3712 brw_push_insn_state(p);
3713 brw_set_default_access_mode(p, BRW_ALIGN_1);
3714 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3715 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3716 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3717
3718 /* We use brw_vec1_reg and unmasked because we want to increment the given
3719 * offset only once.
3720 */
3721 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3722 BRW_ARF_NULL, 0));
3723 brw_set_src0(p, send, brw_vec1_reg(payload.file,
3724 payload.nr, 0));
3725 brw_set_src1(p, send, brw_imm_ud(0));
3726 brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
3727 brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
3728 brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
3729
3730 brw_pop_insn_state(p);
3731 }
3732
3733
3734 /**
3735 * Emit the SEND message for a barrier
3736 */
3737 void
3738 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3739 {
3740 const struct gen_device_info *devinfo = p->devinfo;
3741 struct brw_inst *inst;
3742
3743 assert(devinfo->gen >= 7);
3744
3745 brw_push_insn_state(p);
3746 brw_set_default_access_mode(p, BRW_ALIGN_1);
3747 inst = next_insn(p, BRW_OPCODE_SEND);
3748 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3749 brw_set_src0(p, inst, src);
3750 brw_set_src1(p, inst, brw_null_reg());
3751
3752 brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
3753 1 /* msg_length */,
3754 0 /* response_length */,
3755 false /* header_present */,
3756 false /* end_of_thread */);
3757
3758 brw_inst_set_gateway_notify(devinfo, inst, 1);
3759 brw_inst_set_gateway_subfuncid(devinfo, inst,
3760 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3761
3762 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3763 brw_pop_insn_state(p);
3764 }
3765
3766
3767 /**
3768 * Emit the wait instruction for a barrier
3769 */
3770 void
3771 brw_WAIT(struct brw_codegen *p)
3772 {
3773 const struct gen_device_info *devinfo = p->devinfo;
3774 struct brw_inst *insn;
3775
3776 struct brw_reg src = brw_notification_reg();
3777
3778 insn = next_insn(p, BRW_OPCODE_WAIT);
3779 brw_set_dest(p, insn, src);
3780 brw_set_src0(p, insn, src);
3781 brw_set_src1(p, insn, brw_null_reg());
3782
3783 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3784 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3785 }
3786
3787 /**
3788 * Changes the floating point rounding mode updating the control register
3789 * field defined at cr0.0[5-6] bits. This function supports the changes to
3790 * RTNE (00), RU (01), RD (10) and RTZ (11) rounding using bitwise operations.
3791 * Only RTNE and RTZ rounding are enabled at nir.
3792 */
3793 void
3794 brw_rounding_mode(struct brw_codegen *p,
3795 enum brw_rnd_mode mode)
3796 {
3797 const unsigned bits = mode << BRW_CR0_RND_MODE_SHIFT;
3798
3799 if (bits != BRW_CR0_RND_MODE_MASK) {
3800 brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3801 brw_imm_ud(~BRW_CR0_RND_MODE_MASK));
3802 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3803
3804 /* From the Skylake PRM, Volume 7, page 760:
3805 * "Implementation Restriction on Register Access: When the control
3806 * register is used as an explicit source and/or destination, hardware
3807 * does not ensure execution pipeline coherency. Software must set the
3808 * thread control field to ‘switch’ for an instruction that uses
3809 * control register as an explicit operand."
3810 */
3811 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3812 }
3813
3814 if (bits) {
3815 brw_inst *inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3816 brw_imm_ud(bits));
3817 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3818 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3819 }
3820 }