intel/eu: Switch to a logical state stack
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 struct brw_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct gen_device_info *devinfo = p->devinfo;
51 if (devinfo->gen < 6)
52 return;
53
54 if (src->file == BRW_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58 brw_push_insn_state(p);
59 brw_set_default_exec_size(p, BRW_EXECUTE_8);
60 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
61 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
62 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
63 retype(*src, BRW_REGISTER_TYPE_UD));
64 brw_pop_insn_state(p);
65 }
66 *src = brw_message_reg(msg_reg_nr);
67 }
68
69 static void
70 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
71 {
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
76 *
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
79 */
80 const struct gen_device_info *devinfo = p->devinfo;
81 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
82 reg->file = BRW_GENERAL_REGISTER_FILE;
83 reg->nr += GEN7_MRF_HACK_START;
84 }
85 }
86
/**
 * Program the destination operand fields of \p inst from \p dest.
 *
 * Handles both direct and register-indirect addressing, in both Align1 and
 * Align16 access modes, and optionally shrinks the instruction's execution
 * size to match a small destination register.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* On gen7+, MRFs don't exist; rewrite the register into the GRF space
    * reserved for messages before encoding it.
    */
   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         /* A destination hstride of 0 is not a legal encoding; promote a
          * scalar destination to stride 1.
          */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         /* Align16 subregister numbers are in 16-byte units. */
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          * Although Dst.HorzStride is a don't care for Align16, HW needs
          * this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
         /* Same scalar-destination fixup as the direct-addressing path. */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
         /* even ignored in da16, still need to set as '01' */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct. However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
165
/**
 * Program the first source operand fields of \p inst from \p reg.
 *
 * Handles immediates (including 64-bit DF/Q/UQ), direct and indirect
 * addressing, and both Align1 region descriptions and Align16 swizzles.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 64-bit immediates occupy the whole second dword pair; pick the
       * field setter matching the immediate's width/type.
       */
      if (reg.type == BRW_REGISTER_TYPE_DF ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
         brw_inst_set_imm_df(devinfo, inst, reg.df);
      else if (reg.type == BRW_REGISTER_TYPE_UQ ||
               reg.type == BRW_REGISTER_TYPE_Q)
         brw_inst_set_imm_uq(devinfo, inst, reg.u64);
      else
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);

      /* For sub-64-bit immediates, mirror the type into the src1 slot
       * (with an ARF file) as the encoding expects.
       */
      if (type_sz(reg.type) < 8) {
         brw_inst_set_src1_reg_file(devinfo, inst,
                                    BRW_ARCHITECTURE_REGISTER_FILE);
         brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                       brw_inst_src0_reg_hw_type(devinfo, inst));
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
             brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            /* Align16 subregister numbers are in 16-byte units. */
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
         } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A SIMD1 scalar access is encoded as the <0;1,0> region. */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
            /* This is an oddity of the fact we're using the same
             * descriptions for registers in align_16 as align_1:
             */
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    reg.type == BRW_REGISTER_TYPE_DF &&
                    reg.vstride == BRW_VERTICAL_STRIDE_2) {
            /* From SNB PRM:
             *
             * "For Align16 access mode, only encodings of 0000 and 0011
             *  are allowed. Other codes are reserved."
             *
             * Presumably the DevSNB behavior applies to IVB as well.
             */
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else {
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      }
   }
}
271
272
/**
 * Program the second source operand fields of \p inst from \p reg.
 *
 * Unlike src0, src1 cannot be an accumulator, an MRF, or a 64-bit
 * immediate; those restrictions are asserted here.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *    operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* two-argument instructions can only use 32-bit immediates */
      assert(type_sz(reg.type) < 8);
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         /* Align16 subregister numbers are in 16-byte units. */
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A SIMD1 scalar access is encoded as the <0;1,0> region. */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
            /* This is an oddity of the fact we're using the same
             * descriptions for registers in align_16 as align_1:
             */
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    reg.type == BRW_REGISTER_TYPE_DF &&
                    reg.vstride == BRW_VERTICAL_STRIDE_2) {
            /* From SNB PRM:
             *
             * "For Align16 access mode, only encodings of 0000 and 0011
             *  are allowed. Other codes are reserved."
             *
             * Presumably the DevSNB behavior applies to IVB as well.
             */
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else {
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      }
   }
}
361
362 /**
363 * Set the Message Descriptor and Extended Message Descriptor fields
364 * for SEND messages.
365 *
366 * \note This zeroes out the Function Control bits, so it must be called
367 * \b before filling out any message-specific data. Callers can
368 * choose not to fill in irrelevant bits; they will be zero.
369 */
370 void
371 brw_set_message_descriptor(struct brw_codegen *p,
372 brw_inst *inst,
373 enum brw_message_target sfid,
374 unsigned msg_length,
375 unsigned response_length,
376 bool header_present,
377 bool end_of_thread)
378 {
379 const struct gen_device_info *devinfo = p->devinfo;
380
381 brw_set_src1(p, inst, brw_imm_d(0));
382
383 /* For indirect sends, `inst` will not be the SEND/SENDC instruction
384 * itself; instead, it will be a MOV/OR into the address register.
385 *
386 * In this case, we avoid setting the extended message descriptor bits,
387 * since they go on the later SEND/SENDC instead and if set here would
388 * instead clobber the conditionalmod bits.
389 */
390 unsigned opcode = brw_inst_opcode(devinfo, inst);
391 if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
392 brw_inst_set_sfid(devinfo, inst, sfid);
393 }
394
395 brw_inst_set_mlen(devinfo, inst, msg_length);
396 brw_inst_set_rlen(devinfo, inst, response_length);
397 brw_inst_set_eot(devinfo, inst, end_of_thread);
398
399 if (devinfo->gen >= 5) {
400 brw_inst_set_header_present(devinfo, inst, header_present);
401 }
402 }
403
404 static void brw_set_math_message( struct brw_codegen *p,
405 brw_inst *inst,
406 unsigned function,
407 unsigned integer_type,
408 bool low_precision,
409 unsigned dataType )
410 {
411 const struct gen_device_info *devinfo = p->devinfo;
412 unsigned msg_length;
413 unsigned response_length;
414
415 /* Infer message length from the function */
416 switch (function) {
417 case BRW_MATH_FUNCTION_POW:
418 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
419 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
420 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
421 msg_length = 2;
422 break;
423 default:
424 msg_length = 1;
425 break;
426 }
427
428 /* Infer response length from the function */
429 switch (function) {
430 case BRW_MATH_FUNCTION_SINCOS:
431 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
432 response_length = 2;
433 break;
434 default:
435 response_length = 1;
436 break;
437 }
438
439
440 brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
441 msg_length, response_length, false, false);
442 brw_inst_set_math_msg_function(devinfo, inst, function);
443 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
444 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
445 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
446 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
447 brw_inst_set_saturate(devinfo, inst, 0);
448 }
449
450
451 static void brw_set_ff_sync_message(struct brw_codegen *p,
452 brw_inst *insn,
453 bool allocate,
454 unsigned response_length,
455 bool end_of_thread)
456 {
457 const struct gen_device_info *devinfo = p->devinfo;
458
459 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
460 1, response_length, true, end_of_thread);
461 brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
462 brw_inst_set_urb_allocate(devinfo, insn, allocate);
463 /* The following fields are not used by FF_SYNC: */
464 brw_inst_set_urb_global_offset(devinfo, insn, 0);
465 brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
466 brw_inst_set_urb_used(devinfo, insn, 0);
467 brw_inst_set_urb_complete(devinfo, insn, 0);
468 }
469
/* Fill out the descriptor for a URB write message.  The set of valid
 * \p flags and \p swizzle_control values differs per generation; the
 * asserts below encode those restrictions.
 */
static void brw_set_urb_message( struct brw_codegen *p,
				 brw_inst *insn,
				 enum brw_urb_write_flags flags,
				 unsigned msg_length,
				 unsigned response_length,
				 unsigned offset,
				 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   /* The "complete" bit exists only before gen8. */
   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   /* allocate/used exist before gen7; per-slot offset replaces them after. */
   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
510
511 void
512 brw_set_dp_write_message(struct brw_codegen *p,
513 brw_inst *insn,
514 unsigned binding_table_index,
515 unsigned msg_control,
516 unsigned msg_type,
517 unsigned target_cache,
518 unsigned msg_length,
519 bool header_present,
520 unsigned last_render_target,
521 unsigned response_length,
522 unsigned end_of_thread,
523 unsigned send_commit_msg)
524 {
525 const struct gen_device_info *devinfo = p->devinfo;
526 const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
527 BRW_SFID_DATAPORT_WRITE);
528
529 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
530 header_present, end_of_thread);
531
532 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
533 brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
534 brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
535 brw_inst_set_rt_last(devinfo, insn, last_render_target);
536 if (devinfo->gen < 7) {
537 brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
538 }
539
540 if (devinfo->gen >= 11)
541 brw_inst_set_null_rt(devinfo, insn, false);
542 }
543
544 void
545 brw_set_dp_read_message(struct brw_codegen *p,
546 brw_inst *insn,
547 unsigned binding_table_index,
548 unsigned msg_control,
549 unsigned msg_type,
550 unsigned target_cache,
551 unsigned msg_length,
552 bool header_present,
553 unsigned response_length)
554 {
555 const struct gen_device_info *devinfo = p->devinfo;
556 const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
557 BRW_SFID_DATAPORT_READ);
558
559 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
560 header_present, false);
561
562 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
563 brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
564 brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
565 if (devinfo->gen < 6)
566 brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
567 }
568
569 void
570 brw_set_sampler_message(struct brw_codegen *p,
571 brw_inst *inst,
572 unsigned binding_table_index,
573 unsigned sampler,
574 unsigned msg_type,
575 unsigned response_length,
576 unsigned msg_length,
577 unsigned header_present,
578 unsigned simd_mode,
579 unsigned return_format)
580 {
581 const struct gen_device_info *devinfo = p->devinfo;
582
583 brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
584 response_length, header_present, false);
585
586 brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
587 brw_inst_set_sampler(devinfo, inst, sampler);
588 brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
589 if (devinfo->gen >= 5) {
590 brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
591 } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
592 brw_inst_set_sampler_return_format(devinfo, inst, return_format);
593 }
594 }
595
/* Fill out the descriptor for a gen7+ scratch-space block read/write
 * message through the data cache.  \p num_regs is the number of registers
 * transferred (1, 2, 4, or — on gen8+ — 8); the block-size field encodes
 * it as log2 on gen8+ and as num_regs - 1 before that.
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
623
/* Copy the default instruction state \p state into the encoded fields of
 * \p insn.  Called for every newly allocated instruction.
 */
static void
brw_inst_set_state(const struct gen_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* 3-src align16 instructions encode the flag register in different bits
    * than everything else; flag_subreg packs reg and subreg together as
    * reg * 2 + subreg.
    */
   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   /* The AccWrEn bit only exists on gen6+. */
   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
652
653 #define next_insn brw_next_insn
654 brw_inst *
655 brw_next_insn(struct brw_codegen *p, unsigned opcode)
656 {
657 const struct gen_device_info *devinfo = p->devinfo;
658 brw_inst *insn;
659
660 if (p->nr_insn + 1 > p->store_size) {
661 p->store_size <<= 1;
662 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
663 }
664
665 p->next_insn_offset += 16;
666 insn = &p->store[p->nr_insn++];
667
668 memset(insn, 0, sizeof(*insn));
669 brw_inst_set_opcode(devinfo, insn, opcode);
670
671 /* Apply the default instruction state */
672 brw_inst_set_state(devinfo, insn, p->current);
673
674 return insn;
675 }
676
677 static brw_inst *
678 brw_alu1(struct brw_codegen *p, unsigned opcode,
679 struct brw_reg dest, struct brw_reg src)
680 {
681 brw_inst *insn = next_insn(p, opcode);
682 brw_set_dest(p, insn, dest);
683 brw_set_src0(p, insn, src);
684 return insn;
685 }
686
687 static brw_inst *
688 brw_alu2(struct brw_codegen *p, unsigned opcode,
689 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
690 {
691 /* 64-bit immediates are only supported on 1-src instructions */
692 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
693 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
694
695 brw_inst *insn = next_insn(p, opcode);
696 brw_set_dest(p, insn, dest);
697 brw_set_src0(p, insn, src0);
698 brw_set_src1(p, insn, src1);
699 return insn;
700 }
701
702 static int
703 get_3src_subreg_nr(struct brw_reg reg)
704 {
705 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
706 * use 32-bit units (components 0..7). Since they only support F/D/UD
707 * types, this doesn't lose any flexibility, but uses fewer bits.
708 */
709 return reg.subnr / 4;
710 }
711
712 static enum gen10_align1_3src_vertical_stride
713 to_3src_align1_vstride(enum brw_vertical_stride vstride)
714 {
715 switch (vstride) {
716 case BRW_VERTICAL_STRIDE_0:
717 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
718 case BRW_VERTICAL_STRIDE_2:
719 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
720 case BRW_VERTICAL_STRIDE_4:
721 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
722 case BRW_VERTICAL_STRIDE_8:
723 case BRW_VERTICAL_STRIDE_16:
724 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
725 default:
726 unreachable("invalid vstride");
727 }
728 }
729
730
731 static enum gen10_align1_3src_src_horizontal_stride
732 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
733 {
734 switch (hstride) {
735 case BRW_HORIZONTAL_STRIDE_0:
736 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
737 case BRW_HORIZONTAL_STRIDE_1:
738 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
739 case BRW_HORIZONTAL_STRIDE_2:
740 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
741 case BRW_HORIZONTAL_STRIDE_4:
742 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
743 default:
744 unreachable("invalid hstride");
745 }
746 }
747
/* Emit a three-source ALU instruction (MAD, LRP, BFE, ...).  3-src
 * instructions have two distinct encodings: the gen10+ align1 form
 * (with regioning and an explicit execution type) and the older align16
 * form (with swizzles, writemask, and replicate controls).
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < 128);
   assert(src0.nr < 128);
   assert(src1.nr < 128);
   assert(src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      /* --- gen10+ align1 3-src encoding --- */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      /* An ARF destination means the accumulator. */
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                           BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                           BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      }
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      /* Execution type (float vs integer) follows the destination type. */
      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      brw_inst_set_3src_a1_src0_vstride(devinfo, inst,
                                        to_3src_align1_vstride(src0.vstride));
      brw_inst_set_3src_a1_src1_vstride(devinfo, inst,
                                        to_3src_align1_vstride(src1.vstride));
      /* no vstride on src2 */

      brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src0.hstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));
      brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src2.hstride));

      brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
      /* An NF-typed src0 reads the accumulator. */
      if (src0.type == BRW_REGISTER_TYPE_NF) {
         brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      }
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      /* An ARF src1 reads the accumulator. */
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);

      /* Align1 3-src register file restrictions: src0 may be GRF,
       * immediate, or the NF accumulator; src1 GRF or accumulator;
       * src2 GRF or immediate.
       */
      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                         src0.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                         src1.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_ACCUMULATOR);
      brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                         src2.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
   } else {
      /* --- align16 3-src encoding --- */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D ||
             dest.type == BRW_REGISTER_TYPE_UD);
      if (devinfo->gen == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      /* RepCtrl replicates a scalar (vstride 0) source to all channels. */
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->gen >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types. The MAD and LRP emitters ensure
          * that all four types are float. The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
      }
   }

   return inst;
}
909
910
911 /***********************************************************************
912 * Convenience routines.
913 */
/* Generate a wrapper brw_<OP>() emitting a one-source ALU instruction. */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}
921
/* Generate a wrapper brw_<OP>() emitting a two-source ALU instruction. */
#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0,                     \
                   struct brw_reg src1)                     \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}
930
/* Generate a wrapper brw_<OP>() emitting a three-source ALU instruction. */
#define ALU3(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}
940
/* Like ALU3, but additionally asserts that all four operands are the same
 * floating-point type (all F or all DF).
 */
#define ALU3F(OP)                                                 \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                     \
          dest.type == BRW_REGISTER_TYPE_DF);                     \
   if (dest.type == BRW_REGISTER_TYPE_F) {                        \
      assert(src0.type == BRW_REGISTER_TYPE_F);                   \
      assert(src1.type == BRW_REGISTER_TYPE_F);                   \
      assert(src2.type == BRW_REGISTER_TYPE_F);                   \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                  \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}
961
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD, so the
 * fix-up pair is only emitted on gen < 6.
 */
#define ROUND(OP)                                                  \
void brw_##OP(struct brw_codegen *p,                               \
              struct brw_reg dest,                                 \
              struct brw_reg src)                                  \
{                                                                  \
   const struct gen_device_info *devinfo = p->devinfo;             \
   brw_inst *rnd, *add;                                            \
   rnd = next_insn(p, BRW_OPCODE_##OP);                            \
   brw_set_dest(p, rnd, dest);                                     \
   brw_set_src0(p, rnd, src);                                      \
                                                                   \
   if (devinfo->gen < 6) {                                         \
      /* turn on round-increments */                               \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));               \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
   }                                                               \
}
987
988
/* Instantiate the trivial one-, two- and three-source opcode wrappers. */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
1022
1023 brw_inst *
1024 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1025 {
1026 const struct gen_device_info *devinfo = p->devinfo;
1027
1028 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1029 * To avoid the problems that causes, we use a <1,2,0> source region to read
1030 * each element twice.
1031 */
1032 if (devinfo->gen == 7 && !devinfo->is_haswell &&
1033 brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1034 dest.type == BRW_REGISTER_TYPE_DF &&
1035 (src0.type == BRW_REGISTER_TYPE_F ||
1036 src0.type == BRW_REGISTER_TYPE_D ||
1037 src0.type == BRW_REGISTER_TYPE_UD) &&
1038 !has_scalar_region(src0)) {
1039 assert(src0.vstride == BRW_VERTICAL_STRIDE_4 &&
1040 src0.width == BRW_WIDTH_4 &&
1041 src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1042
1043 src0.vstride = BRW_VERTICAL_STRIDE_1;
1044 src0.width = BRW_WIDTH_2;
1045 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1046 }
1047
1048 return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1049 }
1050
1051 brw_inst *
1052 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1053 struct brw_reg src0, struct brw_reg src1)
1054 {
1055 /* 6.2.2: add */
1056 if (src0.type == BRW_REGISTER_TYPE_F ||
1057 (src0.file == BRW_IMMEDIATE_VALUE &&
1058 src0.type == BRW_REGISTER_TYPE_VF)) {
1059 assert(src1.type != BRW_REGISTER_TYPE_UD);
1060 assert(src1.type != BRW_REGISTER_TYPE_D);
1061 }
1062
1063 if (src1.type == BRW_REGISTER_TYPE_F ||
1064 (src1.file == BRW_IMMEDIATE_VALUE &&
1065 src1.type == BRW_REGISTER_TYPE_VF)) {
1066 assert(src0.type != BRW_REGISTER_TYPE_UD);
1067 assert(src0.type != BRW_REGISTER_TYPE_D);
1068 }
1069
1070 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1071 }
1072
1073 brw_inst *
1074 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1075 struct brw_reg src0, struct brw_reg src1)
1076 {
1077 assert(dest.type == src0.type);
1078 assert(src0.type == src1.type);
1079 switch (src0.type) {
1080 case BRW_REGISTER_TYPE_B:
1081 case BRW_REGISTER_TYPE_UB:
1082 case BRW_REGISTER_TYPE_W:
1083 case BRW_REGISTER_TYPE_UW:
1084 case BRW_REGISTER_TYPE_D:
1085 case BRW_REGISTER_TYPE_UD:
1086 break;
1087 default:
1088 unreachable("Bad type for brw_AVG");
1089 }
1090
1091 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1092 }
1093
1094 brw_inst *
1095 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1096 struct brw_reg src0, struct brw_reg src1)
1097 {
1098 /* 6.32.38: mul */
1099 if (src0.type == BRW_REGISTER_TYPE_D ||
1100 src0.type == BRW_REGISTER_TYPE_UD ||
1101 src1.type == BRW_REGISTER_TYPE_D ||
1102 src1.type == BRW_REGISTER_TYPE_UD) {
1103 assert(dest.type != BRW_REGISTER_TYPE_F);
1104 }
1105
1106 if (src0.type == BRW_REGISTER_TYPE_F ||
1107 (src0.file == BRW_IMMEDIATE_VALUE &&
1108 src0.type == BRW_REGISTER_TYPE_VF)) {
1109 assert(src1.type != BRW_REGISTER_TYPE_UD);
1110 assert(src1.type != BRW_REGISTER_TYPE_D);
1111 }
1112
1113 if (src1.type == BRW_REGISTER_TYPE_F ||
1114 (src1.file == BRW_IMMEDIATE_VALUE &&
1115 src1.type == BRW_REGISTER_TYPE_VF)) {
1116 assert(src0.type != BRW_REGISTER_TYPE_UD);
1117 assert(src0.type != BRW_REGISTER_TYPE_D);
1118 }
1119
1120 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1121 src0.nr != BRW_ARF_ACCUMULATOR);
1122 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1123 src1.nr != BRW_ARF_ACCUMULATOR);
1124
1125 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1126 }
1127
1128 brw_inst *
1129 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1130 struct brw_reg src0, struct brw_reg src1)
1131 {
1132 src0.vstride = BRW_VERTICAL_STRIDE_0;
1133 src0.width = BRW_WIDTH_1;
1134 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1135 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1136 }
1137
1138 brw_inst *
1139 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1140 struct brw_reg src0, struct brw_reg src1)
1141 {
1142 src0.vstride = BRW_VERTICAL_STRIDE_0;
1143 src0.width = BRW_WIDTH_1;
1144 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1145 src1.vstride = BRW_VERTICAL_STRIDE_8;
1146 src1.width = BRW_WIDTH_8;
1147 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1148 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1149 }
1150
/* Emit a float-to-half conversion of src into dst.
 *
 * On gen8+ this is a converting MOV to an HF destination; on gen7 it is the
 * dedicated F32TO16 instruction.  When the destination is UD and the
 * hardware does not zero the high 16 bits itself, a second MOV explicitly
 * writes zeros to the odd W channels.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   /* Saved/restored around the access-mode override below. */
   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      /* View dst as words with stride 2 so the conversion lands in the even
       * (low) halves of each dword.
       */
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* The conversion and the zero-fill write disjoint halves of the same
       * dwords, so suppress dependency tracking between the pair.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1197
1198 brw_inst *
1199 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1200 {
1201 const struct gen_device_info *devinfo = p->devinfo;
1202 bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1203
1204 if (align16) {
1205 assert(src.type == BRW_REGISTER_TYPE_UD);
1206 } else {
1207 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1208 *
1209 * Because this instruction does not have a 16-bit floating-point
1210 * type, the source data type must be Word (W). The destination type
1211 * must be F (Float).
1212 */
1213 if (src.type == BRW_REGISTER_TYPE_UD)
1214 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1215
1216 assert(src.type == BRW_REGISTER_TYPE_W ||
1217 src.type == BRW_REGISTER_TYPE_UW ||
1218 src.type == BRW_REGISTER_TYPE_HF);
1219 }
1220
1221 if (devinfo->gen >= 8) {
1222 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1223 } else {
1224 assert(devinfo->gen == 7);
1225 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1226 }
1227 }
1228
1229
1230 void brw_NOP(struct brw_codegen *p)
1231 {
1232 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1233 memset(insn, 0, sizeof(*insn));
1234 brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1235 }
1236
1237
1238
1239
1240
1241 /***********************************************************************
1242 * Comparisons, if/else/endif
1243 */
1244
1245 brw_inst *
1246 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1247 unsigned predicate_control)
1248 {
1249 const struct gen_device_info *devinfo = p->devinfo;
1250 struct brw_reg ip = brw_ip_reg();
1251 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1252
1253 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1254 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1255 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1256 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1257
1258 return inst;
1259 }
1260
1261 static void
1262 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1263 {
1264 p->if_stack[p->if_stack_depth] = inst - p->store;
1265
1266 p->if_stack_depth++;
1267 if (p->if_stack_array_size <= p->if_stack_depth) {
1268 p->if_stack_array_size *= 2;
1269 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1270 p->if_stack_array_size);
1271 }
1272 }
1273
1274 static brw_inst *
1275 pop_if_stack(struct brw_codegen *p)
1276 {
1277 p->if_stack_depth--;
1278 return &p->store[p->if_stack[p->if_stack_depth]];
1279 }
1280
/* Push a loop (its DO instruction, or loop start) onto the loop stack.
 * Stores an index rather than a pointer because p->store may be reallocated.
 */
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   /* Capacity must cover depth + 1: if_depth_in_loop is written below at the
    * post-increment depth.
    */
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   /* The new nesting level starts with no open IFs inside the loop. */
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}
1296
1297 static brw_inst *
1298 get_inner_do_insn(struct brw_codegen *p)
1299 {
1300 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1301 }
1302
/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 *
 * Jump targets (jump count / JIP / UIP) are left zero here; they are
 * filled in by patch_IF_ELSE() when the matching ENDIF is emitted.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction.  The operand encoding of
    * IF changed across generations:
    */
   if (devinfo->gen < 6) {
      /* Gen4-5: IF operates on IP with an immediate jump count. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: jump count lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: JIP/UIP fields, src1 is an immediate word. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: JIP/UIP fields, src0 is an immediate dword. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires a thread switch unless we are in
    * single-program-flow mode.
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Remember the IF so brw_ELSE/brw_ENDIF can find and patch it. */
   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1359
/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 *
 * The jump count is left zero and patched later by patch_IF_ELSE() when the
 * matching ENDIF is emitted.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The embedded comparison replaces predication; defaults must be clean. */
   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1385
/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 *
 * \param if_inst    the IF instruction to convert (must not be NULL)
 * \param else_inst  the matching ELSE, or NULL if the block has none
 *
 * The immediates are byte offsets; each instruction is 16 bytes, hence the
 * "* 16" factors below.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* IF jumps past the ELSE; ELSE jumps to the would-be ENDIF. */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1426
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * \param if_inst     the IF opening the block (must not be NULL)
 * \param else_inst   the matching ELSE, or NULL if the block has none
 * \param endif_inst  the just-emitted ENDIF (must not be NULL)
 *
 * Jump fields are measured in units of brw_jump_scale() per instruction,
 * which varies by generation.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   /* ENDIF (and ELSE below) inherit the IF's execution size. */
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1517
/* Emit the ELSE of the innermost open IF block.
 *
 * Jump fields are left zero; patch_IF_ELSE() fills them in when the matching
 * ENDIF is emitted.  The ELSE is pushed on the if-stack on top of its IF so
 * brw_ENDIF can pop both.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* Operand encoding varies by generation, mirroring brw_IF(). */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires a thread switch outside SPF mode. */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1555
/* Close the innermost IF (and optional ELSE) block.
 *
 * Emits an ENDIF (except pre-gen6 SPF mode, where IF/ELSE become plain ADDs
 * on IP and no ENDIF is needed), pops the corresponding IF/ELSE off the
 * if-stack, and back-patches their jump targets via patch_IF_ELSE().
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Operand encoding varies by generation, mirroring brw_IF(). */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   /* Now that the ENDIF's location is known, fix up the IF/ELSE targets. */
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1636
/* Emit a BREAK out of the innermost loop.
 *
 * Pre-gen6, BREAK carries a pop count for the IFs currently open inside the
 * loop; its jump target is back-patched later by brw_patch_break_cont().
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      /* Pop the mask stack entries of any IFs open inside this loop. */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
1663
/* Emit a CONTINUE to the innermost loop's next iteration.
 *
 * As with BREAK, the pre-gen6 form carries a pop count for open IFs and its
 * jump target is back-patched later by brw_patch_break_cont().
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      /* Pop the mask stack entries of any IFs open inside this loop. */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1687
/* Emit a HALT instruction (gen6+ encoding).
 *
 * The UIP/JIP targets are left zero here and updated later by the caller
 * (src1 holds them on gen6-7; gen8+ uses src0).
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1707
/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      /* No DO instruction needed; record the loop start (the next
       * instruction slot) on the loop stack for brw_WHILE to jump to.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1750
/**
 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 *
 * \param while_inst  the WHILE that just closed the innermost loop; every
 *                    unpatched BREAK/CONT between the loop's DO and this
 *                    WHILE gets its jump count filled in.
 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   /* Walk backward from the WHILE to the DO (exclusive). */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps just past the WHILE (out of the loop). */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONTINUE jumps to the WHILE itself (next iteration). */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
1782
/**
 * Emit the WHILE that closes the innermost DO...WHILE loop, and pop the
 * loop off the generator's loop stack (p->loop_stack_depth--).
 *
 * Returns the emitted instruction.  For pre-gen6 single-program-flow code
 * this is actually an ADD to the IP register rather than a real WHILE.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* Per-generation encoding of the backwards jump to the DO. */
      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         /* (do_insn - insn) is negative: the jump goes backwards. */
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* No divergence is possible, so the loop can be closed by a plain
          * jump back to the top: IP += (distance in bytes).
          */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Now that the WHILE's location is known, fix up the jump counts
          * of any BREAK/CONT inside the loop body.
          */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1844
1845 /* FORWARD JUMPS:
1846 */
1847 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1848 {
1849 const struct gen_device_info *devinfo = p->devinfo;
1850 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1851 unsigned jmpi = 1;
1852
1853 if (devinfo->gen >= 5)
1854 jmpi = 2;
1855
1856 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1857 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1858
1859 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1860 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1861 }
1862
1863 /* To integrate with the above, it makes sense that the comparison
1864 * instruction should populate the flag register. It might be simpler
1865 * just to use the flag reg for most WM tasks?
1866 */
1867 void brw_CMP(struct brw_codegen *p,
1868 struct brw_reg dest,
1869 unsigned conditional,
1870 struct brw_reg src0,
1871 struct brw_reg src1)
1872 {
1873 const struct gen_device_info *devinfo = p->devinfo;
1874 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1875
1876 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1877 brw_set_dest(p, insn, dest);
1878 brw_set_src0(p, insn, src0);
1879 brw_set_src1(p, insn, src1);
1880
1881 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1882 * page says:
1883 * "Any CMP instruction with a null destination must use a {switch}."
1884 *
1885 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1886 * mentioned on their work-arounds pages.
1887 */
1888 if (devinfo->gen == 7) {
1889 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1890 dest.nr == BRW_ARF_NULL) {
1891 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1892 }
1893 }
1894 }
1895
1896 /***********************************************************************
1897 * Helpers for the various SEND message types:
1898 */
1899
1900 /** Extended math function, float[8].
1901 */
1902 void gen4_math(struct brw_codegen *p,
1903 struct brw_reg dest,
1904 unsigned function,
1905 unsigned msg_reg_nr,
1906 struct brw_reg src,
1907 unsigned precision )
1908 {
1909 const struct gen_device_info *devinfo = p->devinfo;
1910 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1911 unsigned data_type;
1912 if (has_scalar_region(src)) {
1913 data_type = BRW_MATH_DATA_SCALAR;
1914 } else {
1915 data_type = BRW_MATH_DATA_VECTOR;
1916 }
1917
1918 assert(devinfo->gen < 6);
1919
1920 /* Example code doesn't set predicate_control for send
1921 * instructions.
1922 */
1923 brw_inst_set_pred_control(devinfo, insn, 0);
1924 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1925
1926 brw_set_dest(p, insn, dest);
1927 brw_set_src0(p, insn, src);
1928 brw_set_math_message(p,
1929 insn,
1930 function,
1931 src.type == BRW_REGISTER_TYPE_D,
1932 precision,
1933 data_type);
1934 }
1935
1936 void gen6_math(struct brw_codegen *p,
1937 struct brw_reg dest,
1938 unsigned function,
1939 struct brw_reg src0,
1940 struct brw_reg src1)
1941 {
1942 const struct gen_device_info *devinfo = p->devinfo;
1943 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1944
1945 assert(devinfo->gen >= 6);
1946
1947 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1948 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1949
1950 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1951 if (devinfo->gen == 6) {
1952 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1953 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1954 }
1955
1956 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1957 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1958 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1959 assert(src0.type != BRW_REGISTER_TYPE_F);
1960 assert(src1.type != BRW_REGISTER_TYPE_F);
1961 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
1962 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
1963 } else {
1964 assert(src0.type == BRW_REGISTER_TYPE_F);
1965 assert(src1.type == BRW_REGISTER_TYPE_F);
1966 }
1967
1968 /* Source modifiers are ignored for extended math instructions on Gen6. */
1969 if (devinfo->gen == 6) {
1970 assert(!src0.negate);
1971 assert(!src0.abs);
1972 assert(!src1.negate);
1973 assert(!src1.abs);
1974 }
1975
1976 brw_inst_set_math_function(devinfo, insn, function);
1977
1978 brw_set_dest(p, insn, dest);
1979 brw_set_src0(p, insn, src0);
1980 brw_set_src1(p, insn, src1);
1981 }
1982
1983 /**
1984 * Return the right surface index to access the thread scratch space using
1985 * stateless dataport messages.
1986 */
1987 unsigned
1988 brw_scratch_surface_idx(const struct brw_codegen *p)
1989 {
1990 /* The scratch space is thread-local so IA coherency is unnecessary. */
1991 if (p->devinfo->gen >= 8)
1992 return GEN8_BTI_STATELESS_NON_COHERENT;
1993 else
1994 return BRW_BTI_STATELESS;
1995 }
1996
1997 /**
1998 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1999 * using a constant offset per channel.
2000 *
2001 * The offset must be aligned to oword size (16 bytes). Used for
2002 * register spilling.
2003 */
2004 void brw_oword_block_write_scratch(struct brw_codegen *p,
2005 struct brw_reg mrf,
2006 int num_regs,
2007 unsigned offset)
2008 {
2009 const struct gen_device_info *devinfo = p->devinfo;
2010 const unsigned target_cache =
2011 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2012 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2013 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2014 uint32_t msg_type;
2015
2016 if (devinfo->gen >= 6)
2017 offset /= 16;
2018
2019 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2020
2021 const unsigned mlen = 1 + num_regs;
2022
2023 /* Set up the message header. This is g0, with g0.2 filled with
2024 * the offset. We don't want to leave our offset around in g0 or
2025 * it'll screw up texture samples, so set it up inside the message
2026 * reg.
2027 */
2028 {
2029 brw_push_insn_state(p);
2030 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2031 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2032 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2033
2034 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2035
2036 /* set message header global offset field (reg 0, element 2) */
2037 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2038 brw_MOV(p,
2039 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2040 mrf.nr,
2041 2), BRW_REGISTER_TYPE_UD),
2042 brw_imm_ud(offset));
2043
2044 brw_pop_insn_state(p);
2045 }
2046
2047 {
2048 struct brw_reg dest;
2049 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2050 int send_commit_msg;
2051 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2052 BRW_REGISTER_TYPE_UW);
2053
2054 brw_inst_set_compression(devinfo, insn, false);
2055
2056 if (brw_inst_exec_size(devinfo, insn) >= 16)
2057 src_header = vec16(src_header);
2058
2059 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2060 if (devinfo->gen < 6)
2061 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2062
2063 /* Until gen6, writes followed by reads from the same location
2064 * are not guaranteed to be ordered unless write_commit is set.
2065 * If set, then a no-op write is issued to the destination
2066 * register to set a dependency, and a read from the destination
2067 * can be used to ensure the ordering.
2068 *
2069 * For gen6, only writes between different threads need ordering
2070 * protection. Our use of DP writes is all about register
2071 * spilling within a thread.
2072 */
2073 if (devinfo->gen >= 6) {
2074 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2075 send_commit_msg = 0;
2076 } else {
2077 dest = src_header;
2078 send_commit_msg = 1;
2079 }
2080
2081 brw_set_dest(p, insn, dest);
2082 if (devinfo->gen >= 6) {
2083 brw_set_src0(p, insn, mrf);
2084 } else {
2085 brw_set_src0(p, insn, brw_null_reg());
2086 }
2087
2088 if (devinfo->gen >= 6)
2089 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2090 else
2091 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2092
2093 brw_set_dp_write_message(p,
2094 insn,
2095 brw_scratch_surface_idx(p),
2096 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2097 msg_type,
2098 target_cache,
2099 mlen,
2100 true, /* header_present */
2101 0, /* not a render target */
2102 send_commit_msg, /* response_length */
2103 0, /* eot */
2104 send_commit_msg);
2105 }
2106 }
2107
2108
2109 /**
2110 * Read a block of owords (half a GRF each) from the scratch buffer
2111 * using a constant index per channel.
2112 *
2113 * Offset must be aligned to oword size (16 bytes). Used for register
2114 * spilling.
2115 */
2116 void
2117 brw_oword_block_read_scratch(struct brw_codegen *p,
2118 struct brw_reg dest,
2119 struct brw_reg mrf,
2120 int num_regs,
2121 unsigned offset)
2122 {
2123 const struct gen_device_info *devinfo = p->devinfo;
2124
2125 if (devinfo->gen >= 6)
2126 offset /= 16;
2127
2128 if (p->devinfo->gen >= 7) {
2129 /* On gen 7 and above, we no longer have message registers and we can
2130 * send from any register we want. By using the destination register
2131 * for the message, we guarantee that the implied message write won't
2132 * accidentally overwrite anything. This has been a problem because
2133 * the MRF registers and source for the final FB write are both fixed
2134 * and may overlap.
2135 */
2136 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2137 } else {
2138 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2139 }
2140 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2141
2142 const unsigned rlen = num_regs;
2143 const unsigned target_cache =
2144 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2145 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2146 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2147
2148 {
2149 brw_push_insn_state(p);
2150 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2151 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2152 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2153
2154 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2155
2156 /* set message header global offset field (reg 0, element 2) */
2157 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2158 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2159
2160 brw_pop_insn_state(p);
2161 }
2162
2163 {
2164 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2165
2166 assert(brw_inst_pred_control(devinfo, insn) == 0);
2167 brw_inst_set_compression(devinfo, insn, false);
2168
2169 brw_set_dest(p, insn, dest); /* UW? */
2170 if (devinfo->gen >= 6) {
2171 brw_set_src0(p, insn, mrf);
2172 } else {
2173 brw_set_src0(p, insn, brw_null_reg());
2174 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2175 }
2176
2177 brw_set_dp_read_message(p,
2178 insn,
2179 brw_scratch_surface_idx(p),
2180 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2181 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2182 target_cache,
2183 1, /* msg_length */
2184 true, /* header_present */
2185 rlen);
2186 }
2187 }
2188
2189 void
2190 gen7_block_read_scratch(struct brw_codegen *p,
2191 struct brw_reg dest,
2192 int num_regs,
2193 unsigned offset)
2194 {
2195 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2196 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2197
2198 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2199
2200 /* The HW requires that the header is present; this is to get the g0.5
2201 * scratch offset.
2202 */
2203 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2204
2205 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2206 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2207 * is 32 bytes, which happens to be the size of a register.
2208 */
2209 offset /= REG_SIZE;
2210 assert(offset < (1 << 12));
2211
2212 gen7_set_dp_scratch_message(p, insn,
2213 false, /* scratch read */
2214 false, /* OWords */
2215 false, /* invalidate after read */
2216 num_regs,
2217 offset,
2218 1, /* mlen: just g0 */
2219 num_regs, /* rlen */
2220 true); /* header present */
2221 }
2222
2223 /**
2224 * Read float[4] vectors from the data port constant cache.
2225 * Location (in buffer) should be a multiple of 16.
2226 * Used for fetching shader constants.
2227 */
2228 void brw_oword_block_read(struct brw_codegen *p,
2229 struct brw_reg dest,
2230 struct brw_reg mrf,
2231 uint32_t offset,
2232 uint32_t bind_table_index)
2233 {
2234 const struct gen_device_info *devinfo = p->devinfo;
2235 const unsigned target_cache =
2236 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2237 BRW_DATAPORT_READ_TARGET_DATA_CACHE);
2238 const unsigned exec_size = 1 << brw_get_default_exec_size(p);
2239
2240 /* On newer hardware, offset is in units of owords. */
2241 if (devinfo->gen >= 6)
2242 offset /= 16;
2243
2244 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2245
2246 brw_push_insn_state(p);
2247 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2248 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2249 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2250
2251 brw_push_insn_state(p);
2252 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2253 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2254
2255 /* set message header global offset field (reg 0, element 2) */
2256 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2257 brw_MOV(p,
2258 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2259 mrf.nr,
2260 2), BRW_REGISTER_TYPE_UD),
2261 brw_imm_ud(offset));
2262 brw_pop_insn_state(p);
2263
2264 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2265
2266 /* cast dest to a uword[8] vector */
2267 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2268
2269 brw_set_dest(p, insn, dest);
2270 if (devinfo->gen >= 6) {
2271 brw_set_src0(p, insn, mrf);
2272 } else {
2273 brw_set_src0(p, insn, brw_null_reg());
2274 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2275 }
2276
2277 brw_set_dp_read_message(p, insn, bind_table_index,
2278 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2279 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2280 target_cache,
2281 1, /* msg_length */
2282 true, /* header_present */
2283 DIV_ROUND_UP(exec_size, 8)); /* response_length */
2284
2285 brw_pop_insn_state(p);
2286 }
2287
2288
/**
 * Emit a framebuffer (render target) write message.
 *
 * On gen6+ this is a headerless SENDC taking the color payload directly;
 * pre-gen6 it is a SEND whose payload lives in the MRF named by
 * payload.nr, with src0 set to \p implied_header.
 */
void brw_fb_WRITE(struct brw_codegen *p,
                  struct brw_reg payload,
                  struct brw_reg implied_header,
                  unsigned msg_control,
                  unsigned binding_table_index,
                  unsigned msg_length,
                  unsigned response_length,
                  bool eot,
                  bool last_render_target,
                  bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   /* The write has no real destination; size the null reg to the exec size. */
   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
			    insn,
			    binding_table_index,
			    msg_control,
			    msg_type,
			    target_cache,
			    msg_length,
			    header_present,
			    last_render_target,
			    response_length,
			    eot,
			    0 /* send_commit_msg */);
}
2348
/**
 * Emit a gen9+ render target read message.
 *
 * \param per_sample selects the per-sample variant of the message (encoded
 *                   in bit 5 of the message control)
 * Returns the emitted SENDC instruction.
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   /* Message subtype 0 is SIMD16, 1 otherwise. */
   const unsigned msg_subtype =
      brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_dp_read_message(p, insn, binding_table_index,
                           per_sample << 5 | msg_subtype,
                           GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                           GEN6_SFID_DATAPORT_RENDER_CACHE,
                           msg_length, true /* header_present */,
                           response_length);
   /* Select which half of a SIMD32 slot group this read targets. */
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}
2376
2377 /**
2378 * Texture sample instruction.
2379 * Note: the msg_type plus msg_length values determine exactly what kind
2380 * of sampling operation is performed. See volume 4, page 161 of docs.
2381 */
2382 void brw_SAMPLE(struct brw_codegen *p,
2383 struct brw_reg dest,
2384 unsigned msg_reg_nr,
2385 struct brw_reg src0,
2386 unsigned binding_table_index,
2387 unsigned sampler,
2388 unsigned msg_type,
2389 unsigned response_length,
2390 unsigned msg_length,
2391 unsigned header_present,
2392 unsigned simd_mode,
2393 unsigned return_format)
2394 {
2395 const struct gen_device_info *devinfo = p->devinfo;
2396 brw_inst *insn;
2397
2398 if (msg_reg_nr != -1)
2399 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2400
2401 insn = next_insn(p, BRW_OPCODE_SEND);
2402 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2403
2404 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2405 *
2406 * "Instruction compression is not allowed for this instruction (that
2407 * is, send). The hardware behavior is undefined if this instruction is
2408 * set as compressed. However, compress control can be set to "SecHalf"
2409 * to affect the EMask generation."
2410 *
2411 * No similar wording is found in later PRMs, but there are examples
2412 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2413 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2414 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2415 */
2416 brw_inst_set_compression(devinfo, insn, false);
2417
2418 if (devinfo->gen < 6)
2419 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2420
2421 brw_set_dest(p, insn, dest);
2422 brw_set_src0(p, insn, src0);
2423 brw_set_sampler_message(p, insn,
2424 binding_table_index,
2425 sampler,
2426 msg_type,
2427 response_length,
2428 msg_length,
2429 header_present,
2430 simd_mode,
2431 return_format);
2432 }
2433
2434 /* Adjust the message header's sampler state pointer to
2435 * select the correct group of 16 samplers.
2436 */
2437 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2438 struct brw_reg header,
2439 struct brw_reg sampler_index)
2440 {
2441 /* The "Sampler Index" field can only store values between 0 and 15.
2442 * However, we can add an offset to the "Sampler State Pointer"
2443 * field, effectively selecting a different set of 16 samplers.
2444 *
2445 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2446 * offset, and each sampler state is only 16-bytes, so we can't
2447 * exclusively use the offset - we have to use both.
2448 */
2449
2450 const struct gen_device_info *devinfo = p->devinfo;
2451
2452 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2453 const int sampler_state_size = 16; /* 16 bytes */
2454 uint32_t sampler = sampler_index.ud;
2455
2456 if (sampler >= 16) {
2457 assert(devinfo->is_haswell || devinfo->gen >= 8);
2458 brw_ADD(p,
2459 get_element_ud(header, 3),
2460 get_element_ud(brw_vec8_grf(0, 0), 3),
2461 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2462 }
2463 } else {
2464 /* Non-const sampler array indexing case */
2465 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2466 return;
2467 }
2468
2469 struct brw_reg temp = get_element_ud(header, 3);
2470
2471 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2472 brw_SHL(p, temp, temp, brw_imm_ud(4));
2473 brw_ADD(p,
2474 get_element_ud(header, 3),
2475 get_element_ud(brw_vec8_grf(0, 0), 3),
2476 temp);
2477 }
2478 }
2479
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
/**
 * Emit a URB write SEND.  The payload lives at msg_reg_nr (pre-gen6 the
 * base MRF is also recorded in the instruction).
 */
void brw_urb_WRITE(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2531
/**
 * Emit a SEND on shared function \p sfid with descriptor \p desc, which may
 * be an immediate or a register.  In the register case the descriptor is
 * first OR'd into address register a0 so the caller can add further
 * descriptor bits with the usual brw_set_*_message() helpers.
 *
 * Returns the setup instruction: the SEND itself for an immediate
 * descriptor, or the OR for an indirect one.
 */
struct brw_inst *
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;
   int setup;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   /* We hold on to the setup instruction (the SEND in the direct case, the OR
    * in the indirect case) by its index in the instruction store.  The
    * pointer returned by next_insn() may become invalid if emitting the SEND
    * in the indirect case reallocs the store.
    */

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      setup = p->nr_insn;
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, desc);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the usual
       * brw_set_*_message() helper functions.
       */
      setup = p->nr_insn;
      brw_OR(p, addr, desc, brw_imm_ud(0));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   if (dst.width < BRW_EXECUTE_8)
      brw_inst_set_exec_size(devinfo, send, dst.width);

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);

   /* Return by index, not by the possibly-stale `send` pointer. */
   return &p->store[setup];
}
2589
/**
 * Emit a surface-access SEND where the surface index may itself be
 * indirect (a register).  An indirect surface is masked to 8 bits and
 * loaded into a0 before being used as the descriptor.
 *
 * Returns the setup instruction (see brw_send_indirect_message()).
 */
static struct brw_inst *
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned message_len,
                                  unsigned response_len,
                                  bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      insn = brw_AND(p, addr,
                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                               BRW_GET_SWZ(surface.swizzle, 0)),
                     brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
   brw_inst_set_mlen(devinfo, insn, message_len);
   brw_inst_set_rlen(devinfo, insn, response_len);
   brw_inst_set_header_present(devinfo, insn, header_present);

   return insn;
}
2632
2633 static bool
2634 while_jumps_before_offset(const struct gen_device_info *devinfo,
2635 brw_inst *insn, int while_offset, int start_offset)
2636 {
2637 int scale = 16 / brw_jump_scale(devinfo);
2638 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2639 : brw_inst_jip(devinfo, insn);
2640 assert(jip < 0);
2641 return while_offset + jip * scale <= start_offset;
2642 }
2643
2644
/**
 * Scan forward from \p start_offset for the instruction that ends the
 * current control-flow block: an ENDIF, ELSE, or HALT at nesting depth 0,
 * or a WHILE that jumps back before our start (i.e. the end of the
 * enclosing loop).  Nested IF/ENDIF pairs are tracked via \c depth and
 * skipped.  Returns 0 if no block end is found.
 */
static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct gen_device_info *devinfo = p->devinfo;

   int depth = 0;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_IF:
         depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case BRW_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop.  Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         /* fallthrough */
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_HALT:
         if (depth == 0)
            return offset;
      }
   }

   return 0;
}
2684
2685 /* There is no DO instruction on gen6, so to find the end of the loop
2686 * we have to see if the loop is jumping back before our start
2687 * instruction.
2688 */
2689 static int
2690 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2691 {
2692 const struct gen_device_info *devinfo = p->devinfo;
2693 int offset;
2694 void *store = p->store;
2695
2696 assert(devinfo->gen >= 6);
2697
2698 /* Always start after the instruction (such as a WHILE) we're trying to fix
2699 * up.
2700 */
2701 for (offset = next_offset(devinfo, store, start_offset);
2702 offset < p->next_insn_offset;
2703 offset = next_offset(devinfo, store, offset)) {
2704 brw_inst *insn = store + offset;
2705
2706 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2707 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2708 return offset;
2709 }
2710 }
2711 assert(!"not reached");
2712 return start_offset;
2713 }
2714
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * No-op before gen6, where BREAK/CONT were patched at WHILE-emission time
 * instead (see brw_patch_break_cont()).
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   /* Offsets are in bytes (16 per instruction); jumps are in br-units. */
   int scale = 16 / br;
   void *store = p->store;

   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset +
             (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end jumps one instruction
          * forward (to the next instruction).
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }
   }
}
2787
2788 void brw_ff_sync(struct brw_codegen *p,
2789 struct brw_reg dest,
2790 unsigned msg_reg_nr,
2791 struct brw_reg src0,
2792 bool allocate,
2793 unsigned response_length,
2794 bool eot)
2795 {
2796 const struct gen_device_info *devinfo = p->devinfo;
2797 brw_inst *insn;
2798
2799 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2800
2801 insn = next_insn(p, BRW_OPCODE_SEND);
2802 brw_set_dest(p, insn, dest);
2803 brw_set_src0(p, insn, src0);
2804 brw_set_src1(p, insn, brw_imm_d(0));
2805
2806 if (devinfo->gen < 6)
2807 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2808
2809 brw_set_ff_sync_message(p,
2810 insn,
2811 allocate,
2812 response_length,
2813 eot);
2814 }
2815
2816 /**
2817 * Emit the SEND instruction necessary to generate stream output data on Gen6
2818 * (for transform feedback).
2819 *
2820 * If send_commit_msg is true, this is the last piece of stream output data
2821 * from this thread, so send the data as a committed write. According to the
2822 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2823 *
2824 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2825 * writes are complete by sending the final write as a committed write."
2826 */
2827 void
2828 brw_svb_write(struct brw_codegen *p,
2829 struct brw_reg dest,
2830 unsigned msg_reg_nr,
2831 struct brw_reg src0,
2832 unsigned binding_table_index,
2833 bool send_commit_msg)
2834 {
2835 const struct gen_device_info *devinfo = p->devinfo;
2836 const unsigned target_cache =
2837 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2838 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2839 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2840 brw_inst *insn;
2841
2842 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2843
2844 insn = next_insn(p, BRW_OPCODE_SEND);
2845 brw_set_dest(p, insn, dest);
2846 brw_set_src0(p, insn, src0);
2847 brw_set_src1(p, insn, brw_imm_d(0));
2848 brw_set_dp_write_message(p, insn,
2849 binding_table_index,
2850 0, /* msg_control: ignored */
2851 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2852 target_cache,
2853 1, /* msg_length */
2854 true, /* header_present */
2855 0, /* last_render_target: ignored */
2856 send_commit_msg, /* response_length */
2857 0, /* end_of_thread */
2858 send_commit_msg); /* send_commit_msg */
2859 }
2860
2861 static unsigned
2862 brw_surface_payload_size(struct brw_codegen *p,
2863 unsigned num_channels,
2864 bool has_simd4x2,
2865 bool has_simd16)
2866 {
2867 if (has_simd4x2 && brw_get_default_access_mode(p) == BRW_ALIGN_16)
2868 return 1;
2869 else if (has_simd16 && brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2870 return 2 * num_channels;
2871 else
2872 return num_channels;
2873 }
2874
2875 static void
2876 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2877 brw_inst *insn,
2878 unsigned atomic_op,
2879 bool response_expected)
2880 {
2881 const struct gen_device_info *devinfo = p->devinfo;
2882 unsigned msg_control =
2883 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2884 (response_expected ? 1 << 5 : 0); /* Return data expected */
2885
2886 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2887 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2888 if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
2889 msg_control |= 1 << 4; /* SIMD8 mode */
2890
2891 brw_inst_set_dp_msg_type(devinfo, insn,
2892 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2893 } else {
2894 brw_inst_set_dp_msg_type(devinfo, insn,
2895 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2896 }
2897 } else {
2898 brw_inst_set_dp_msg_type(devinfo, insn,
2899 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2900
2901 if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
2902 msg_control |= 1 << 4; /* SIMD8 mode */
2903 }
2904
2905 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2906 }
2907
2908 void
2909 brw_untyped_atomic(struct brw_codegen *p,
2910 struct brw_reg dst,
2911 struct brw_reg payload,
2912 struct brw_reg surface,
2913 unsigned atomic_op,
2914 unsigned msg_length,
2915 bool response_expected,
2916 bool header_present)
2917 {
2918 const struct gen_device_info *devinfo = p->devinfo;
2919 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2920 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2921 GEN7_SFID_DATAPORT_DATA_CACHE);
2922 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
2923 /* Mask out unused components -- This is especially important in Align16
2924 * mode on generations that don't have native support for SIMD4x2 atomics,
2925 * because unused but enabled components will cause the dataport to perform
2926 * additional atomic operations on the addresses that happen to be in the
2927 * uninitialized Y, Z and W coordinates of the payload.
2928 */
2929 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2930 struct brw_inst *insn = brw_send_indirect_surface_message(
2931 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
2932 brw_surface_payload_size(p, response_expected,
2933 devinfo->gen >= 8 || devinfo->is_haswell, true),
2934 header_present);
2935
2936 brw_set_dp_untyped_atomic_message(
2937 p, insn, atomic_op, response_expected);
2938 }
2939
2940 static void
2941 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2942 struct brw_inst *insn,
2943 unsigned num_channels)
2944 {
2945 const struct gen_device_info *devinfo = p->devinfo;
2946 /* Set mask of 32-bit channels to drop. */
2947 unsigned msg_control = 0xf & (0xf << num_channels);
2948
2949 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2950 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2951 msg_control |= 1 << 4; /* SIMD16 mode */
2952 else
2953 msg_control |= 2 << 4; /* SIMD8 mode */
2954 }
2955
2956 brw_inst_set_dp_msg_type(devinfo, insn,
2957 (devinfo->gen >= 8 || devinfo->is_haswell ?
2958 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2959 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2960 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2961 }
2962
2963 void
2964 brw_untyped_surface_read(struct brw_codegen *p,
2965 struct brw_reg dst,
2966 struct brw_reg payload,
2967 struct brw_reg surface,
2968 unsigned msg_length,
2969 unsigned num_channels)
2970 {
2971 const struct gen_device_info *devinfo = p->devinfo;
2972 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2973 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2974 GEN7_SFID_DATAPORT_DATA_CACHE);
2975 struct brw_inst *insn = brw_send_indirect_surface_message(
2976 p, sfid, dst, payload, surface, msg_length,
2977 brw_surface_payload_size(p, num_channels, true, true),
2978 false);
2979
2980 brw_set_dp_untyped_surface_read_message(
2981 p, insn, num_channels);
2982 }
2983
2984 static void
2985 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
2986 struct brw_inst *insn,
2987 unsigned num_channels)
2988 {
2989 const struct gen_device_info *devinfo = p->devinfo;
2990 /* Set mask of 32-bit channels to drop. */
2991 unsigned msg_control = 0xf & (0xf << num_channels);
2992
2993 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2994 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2995 msg_control |= 1 << 4; /* SIMD16 mode */
2996 else
2997 msg_control |= 2 << 4; /* SIMD8 mode */
2998 } else {
2999 if (devinfo->gen >= 8 || devinfo->is_haswell)
3000 msg_control |= 0 << 4; /* SIMD4x2 mode */
3001 else
3002 msg_control |= 2 << 4; /* SIMD8 mode */
3003 }
3004
3005 brw_inst_set_dp_msg_type(devinfo, insn,
3006 devinfo->gen >= 8 || devinfo->is_haswell ?
3007 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
3008 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
3009 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3010 }
3011
3012 void
3013 brw_untyped_surface_write(struct brw_codegen *p,
3014 struct brw_reg payload,
3015 struct brw_reg surface,
3016 unsigned msg_length,
3017 unsigned num_channels,
3018 bool header_present)
3019 {
3020 const struct gen_device_info *devinfo = p->devinfo;
3021 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3022 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3023 GEN7_SFID_DATAPORT_DATA_CACHE);
3024 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3025 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3026 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3027 WRITEMASK_X : WRITEMASK_XYZW;
3028 struct brw_inst *insn = brw_send_indirect_surface_message(
3029 p, sfid, brw_writemask(brw_null_reg(), mask),
3030 payload, surface, msg_length, 0, header_present);
3031
3032 brw_set_dp_untyped_surface_write_message(
3033 p, insn, num_channels);
3034 }
3035
3036 static unsigned
3037 brw_byte_scattered_data_element_from_bit_size(unsigned bit_size)
3038 {
3039 switch (bit_size) {
3040 case 8:
3041 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_BYTE;
3042 case 16:
3043 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_WORD;
3044 case 32:
3045 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_DWORD;
3046 default:
3047 unreachable("Unsupported bit_size for byte scattered messages");
3048 }
3049 }
3050
3051
3052 void
3053 brw_byte_scattered_read(struct brw_codegen *p,
3054 struct brw_reg dst,
3055 struct brw_reg payload,
3056 struct brw_reg surface,
3057 unsigned msg_length,
3058 unsigned bit_size)
3059 {
3060 const struct gen_device_info *devinfo = p->devinfo;
3061 assert(devinfo->gen > 7 || devinfo->is_haswell);
3062 assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
3063 const unsigned sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
3064
3065 struct brw_inst *insn = brw_send_indirect_surface_message(
3066 p, sfid, dst, payload, surface, msg_length,
3067 brw_surface_payload_size(p, 1, true, true),
3068 false);
3069
3070 unsigned msg_control =
3071 brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
3072
3073 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
3074 msg_control |= 1; /* SIMD16 mode */
3075 else
3076 msg_control |= 0; /* SIMD8 mode */
3077
3078 brw_inst_set_dp_msg_type(devinfo, insn,
3079 HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ);
3080 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3081 }
3082
3083 void
3084 brw_byte_scattered_write(struct brw_codegen *p,
3085 struct brw_reg payload,
3086 struct brw_reg surface,
3087 unsigned msg_length,
3088 unsigned bit_size,
3089 bool header_present)
3090 {
3091 const struct gen_device_info *devinfo = p->devinfo;
3092 assert(devinfo->gen > 7 || devinfo->is_haswell);
3093 assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
3094 const unsigned sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
3095
3096 struct brw_inst *insn = brw_send_indirect_surface_message(
3097 p, sfid, brw_writemask(brw_null_reg(), WRITEMASK_XYZW),
3098 payload, surface, msg_length, 0, header_present);
3099
3100 unsigned msg_control =
3101 brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
3102
3103 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
3104 msg_control |= 1;
3105 else
3106 msg_control |= 0;
3107
3108 brw_inst_set_dp_msg_type(devinfo, insn,
3109 HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE);
3110 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3111 }
3112
3113 static void
3114 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3115 struct brw_inst *insn,
3116 unsigned atomic_op,
3117 bool response_expected)
3118 {
3119 const struct gen_device_info *devinfo = p->devinfo;
3120 unsigned msg_control =
3121 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3122 (response_expected ? 1 << 5 : 0); /* Return data expected */
3123
3124 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3125 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3126 if ((brw_get_default_group(p) / 8) % 2 == 1)
3127 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3128
3129 brw_inst_set_dp_msg_type(devinfo, insn,
3130 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3131 } else {
3132 brw_inst_set_dp_msg_type(devinfo, insn,
3133 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3134 }
3135
3136 } else {
3137 brw_inst_set_dp_msg_type(devinfo, insn,
3138 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3139
3140 if ((brw_get_default_group(p) / 8) % 2 == 1)
3141 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3142 }
3143
3144 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3145 }
3146
3147 void
3148 brw_typed_atomic(struct brw_codegen *p,
3149 struct brw_reg dst,
3150 struct brw_reg payload,
3151 struct brw_reg surface,
3152 unsigned atomic_op,
3153 unsigned msg_length,
3154 bool response_expected,
3155 bool header_present) {
3156 const struct gen_device_info *devinfo = p->devinfo;
3157 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3158 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3159 GEN6_SFID_DATAPORT_RENDER_CACHE);
3160 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3161 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3162 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3163 struct brw_inst *insn = brw_send_indirect_surface_message(
3164 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3165 brw_surface_payload_size(p, response_expected,
3166 devinfo->gen >= 8 || devinfo->is_haswell, false),
3167 header_present);
3168
3169 brw_set_dp_typed_atomic_message(
3170 p, insn, atomic_op, response_expected);
3171 }
3172
3173 static void
3174 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3175 struct brw_inst *insn,
3176 unsigned num_channels)
3177 {
3178 const struct gen_device_info *devinfo = p->devinfo;
3179 /* Set mask of unused channels. */
3180 unsigned msg_control = 0xf & (0xf << num_channels);
3181
3182 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3183 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3184 if ((brw_get_default_group(p) / 8) % 2 == 1)
3185 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3186 else
3187 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3188 }
3189
3190 brw_inst_set_dp_msg_type(devinfo, insn,
3191 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3192 } else {
3193 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3194 if ((brw_get_default_group(p) / 8) % 2 == 1)
3195 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3196 }
3197
3198 brw_inst_set_dp_msg_type(devinfo, insn,
3199 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3200 }
3201
3202 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3203 }
3204
3205 void
3206 brw_typed_surface_read(struct brw_codegen *p,
3207 struct brw_reg dst,
3208 struct brw_reg payload,
3209 struct brw_reg surface,
3210 unsigned msg_length,
3211 unsigned num_channels,
3212 bool header_present)
3213 {
3214 const struct gen_device_info *devinfo = p->devinfo;
3215 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3216 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3217 GEN6_SFID_DATAPORT_RENDER_CACHE);
3218 struct brw_inst *insn = brw_send_indirect_surface_message(
3219 p, sfid, dst, payload, surface, msg_length,
3220 brw_surface_payload_size(p, num_channels,
3221 devinfo->gen >= 8 || devinfo->is_haswell, false),
3222 header_present);
3223
3224 brw_set_dp_typed_surface_read_message(
3225 p, insn, num_channels);
3226 }
3227
3228 static void
3229 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3230 struct brw_inst *insn,
3231 unsigned num_channels)
3232 {
3233 const struct gen_device_info *devinfo = p->devinfo;
3234 /* Set mask of unused channels. */
3235 unsigned msg_control = 0xf & (0xf << num_channels);
3236
3237 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3238 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3239 if ((brw_get_default_group(p) / 8) % 2 == 1)
3240 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3241 else
3242 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3243 }
3244
3245 brw_inst_set_dp_msg_type(devinfo, insn,
3246 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3247
3248 } else {
3249 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3250 if ((brw_get_default_group(p) / 8) % 2 == 1)
3251 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3252 }
3253
3254 brw_inst_set_dp_msg_type(devinfo, insn,
3255 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3256 }
3257
3258 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3259 }
3260
3261 void
3262 brw_typed_surface_write(struct brw_codegen *p,
3263 struct brw_reg payload,
3264 struct brw_reg surface,
3265 unsigned msg_length,
3266 unsigned num_channels,
3267 bool header_present)
3268 {
3269 const struct gen_device_info *devinfo = p->devinfo;
3270 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3271 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3272 GEN6_SFID_DATAPORT_RENDER_CACHE);
3273 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3274 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3275 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3276 WRITEMASK_X : WRITEMASK_XYZW);
3277 struct brw_inst *insn = brw_send_indirect_surface_message(
3278 p, sfid, brw_writemask(brw_null_reg(), mask),
3279 payload, surface, msg_length, 0, header_present);
3280
3281 brw_set_dp_typed_surface_write_message(
3282 p, insn, num_channels);
3283 }
3284
3285 static void
3286 brw_set_memory_fence_message(struct brw_codegen *p,
3287 struct brw_inst *insn,
3288 enum brw_message_target sfid,
3289 bool commit_enable)
3290 {
3291 const struct gen_device_info *devinfo = p->devinfo;
3292
3293 brw_set_message_descriptor(p, insn, sfid,
3294 1 /* message length */,
3295 (commit_enable ? 1 : 0) /* response length */,
3296 true /* header present */,
3297 false);
3298
3299 switch (sfid) {
3300 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3301 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3302 break;
3303 case GEN7_SFID_DATAPORT_DATA_CACHE:
3304 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3305 break;
3306 default:
3307 unreachable("Not reached");
3308 }
3309
3310 if (commit_enable)
3311 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3312 }
3313
/**
 * Emit a memory fence using \p send_op.  \p dst is used only for
 * dependency tracking; the fence message itself writes nothing back
 * (unless commit is enabled, in which case the commit data lands there).
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 enum opcode send_op)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Commit is required on Gen10+ (HSD workaround) and on IVB, where the
    * stalling MOV below reads the commit response.
    */
   const bool commit_enable =
      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
      (devinfo->gen == 7 && !devinfo->is_haswell);
   struct brw_inst *insn;

   /* Emit everything unmasked with a scalar execution size. */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = vec1(dst);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, send_op);
   dst = retype(dst, BRW_REGISTER_TYPE_UW);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too. Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, send_op);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   brw_pop_insn_state(p);
}
3361
3362 void
3363 brw_pixel_interpolator_query(struct brw_codegen *p,
3364 struct brw_reg dest,
3365 struct brw_reg mrf,
3366 bool noperspective,
3367 unsigned mode,
3368 struct brw_reg data,
3369 unsigned msg_length,
3370 unsigned response_length)
3371 {
3372 const struct gen_device_info *devinfo = p->devinfo;
3373 struct brw_inst *insn;
3374 const uint16_t exec_size = brw_get_default_exec_size(p);
3375
3376 /* brw_send_indirect_message will automatically use a direct send message
3377 * if data is actually immediate.
3378 */
3379 insn = brw_send_indirect_message(p,
3380 GEN7_SFID_PIXEL_INTERPOLATOR,
3381 dest,
3382 mrf,
3383 vec1(data));
3384 brw_inst_set_mlen(devinfo, insn, msg_length);
3385 brw_inst_set_rlen(devinfo, insn, response_length);
3386
3387 brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
3388 brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
3389 brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
3390 brw_inst_set_pi_message_type(devinfo, insn, mode);
3391 }
3392
/**
 * Write the index of the first enabled channel (relative to the current
 * quarter control) into the first component of \p dst.
 *
 * \param mask  UD register (or all-ones immediate) holding the dispatch
 *              (or vector) mask, used where ce0 alone can't be trusted.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask. The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n). Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         /* Gen7 path: materialize the execution mask in a flag register. */
         const struct brw_reg flag = brw_flag_reg(p->current->flag_subreg / 2,
                                                  p->current->flag_subreg % 2);

         /* Clear the flag first so untouched bits read as zero. */
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0. We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register. Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3505
/**
 * Copy component \p idx of \p src into \p dst.  \p idx may be an
 * immediate or a register; in the register case the component is fetched
 * with indirect addressing (Align1) or a predicated SEL (Align16).
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   /* Emit unmasked; a scalar result in Align1, one vector in Align16. */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address. The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         /* The address computation must not be predicated. */
         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV. Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3625
3626 /**
3627 * This instruction is generated as a single-channel align1 instruction by
3628 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3629 *
3630 * We can't use the typed atomic op in the FS because that has the execution
3631 * mask ANDed with the pixel mask, but we just want to write the one dword for
3632 * all the pixels.
3633 *
3634 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3635 * one u32. So we use the same untyped atomic write message as the pixel
3636 * shader.
3637 *
3638 * The untyped atomic operation requires a BUFFER surface type with RAW
3639 * format, and is only accessible through the legacy DATA_CACHE dataport
3640 * messages.
3641 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   /* Untyped atomics live on DC port 1 on HSW+ and the legacy data-cache
    * SFID on IVB.
    */
   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(p->devinfo->gen >= 7);

   /* Emit a single-channel, unmasked, uncompressed Align1 SEND. */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_src1(p, send, brw_imm_ud(0));
   /* Two registers of payload, no response, no header. */
   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
   /* Untyped atomic ADD; no return data is needed. */
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);

   brw_pop_insn_state(p);
}
3671
3672
3673 /**
3674 * Emit the SEND message for a barrier
3675 */
3676 void
3677 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3678 {
3679 const struct gen_device_info *devinfo = p->devinfo;
3680 struct brw_inst *inst;
3681
3682 assert(devinfo->gen >= 7);
3683
3684 brw_push_insn_state(p);
3685 brw_set_default_access_mode(p, BRW_ALIGN_1);
3686 inst = next_insn(p, BRW_OPCODE_SEND);
3687 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3688 brw_set_src0(p, inst, src);
3689 brw_set_src1(p, inst, brw_null_reg());
3690
3691 brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
3692 1 /* msg_length */,
3693 0 /* response_length */,
3694 false /* header_present */,
3695 false /* end_of_thread */);
3696
3697 brw_inst_set_gateway_notify(devinfo, inst, 1);
3698 brw_inst_set_gateway_subfuncid(devinfo, inst,
3699 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3700
3701 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3702 brw_pop_insn_state(p);
3703 }
3704
3705
3706 /**
3707 * Emit the wait instruction for a barrier
3708 */
3709 void
3710 brw_WAIT(struct brw_codegen *p)
3711 {
3712 const struct gen_device_info *devinfo = p->devinfo;
3713 struct brw_inst *insn;
3714
3715 struct brw_reg src = brw_notification_reg();
3716
3717 insn = next_insn(p, BRW_OPCODE_WAIT);
3718 brw_set_dest(p, insn, src);
3719 brw_set_src0(p, insn, src);
3720 brw_set_src1(p, insn, brw_null_reg());
3721
3722 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3723 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3724 }
3725
3726 /**
3727 * Changes the floating point rounding mode updating the control register
3728 * field defined at cr0.0[5-6] bits. This function supports the changes to
3729 * RTNE (00), RU (01), RD (10) and RTZ (11) rounding using bitwise operations.
3730 * Only RTNE and RTZ rounding are enabled at nir.
3731 */
3732 void
3733 brw_rounding_mode(struct brw_codegen *p,
3734 enum brw_rnd_mode mode)
3735 {
3736 const unsigned bits = mode << BRW_CR0_RND_MODE_SHIFT;
3737
3738 if (bits != BRW_CR0_RND_MODE_MASK) {
3739 brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3740 brw_imm_ud(~BRW_CR0_RND_MODE_MASK));
3741 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3742
3743 /* From the Skylake PRM, Volume 7, page 760:
3744 * "Implementation Restriction on Register Access: When the control
3745 * register is used as an explicit source and/or destination, hardware
3746 * does not ensure execution pipeline coherency. Software must set the
3747 * thread control field to ‘switch’ for an instruction that uses
3748 * control register as an explicit operand."
3749 */
3750 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3751 }
3752
3753 if (bits) {
3754 brw_inst *inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3755 brw_imm_ud(bits));
3756 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3757 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3758 }
3759 }