intel/eu: Fix broadcast instruction for 64-bit values on little-core
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 struct brw_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct gen_device_info *devinfo = p->devinfo;
51 if (devinfo->gen < 6)
52 return;
53
54 if (src->file == BRW_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58 brw_push_insn_state(p);
59 brw_set_default_exec_size(p, BRW_EXECUTE_8);
60 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
61 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
62 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
63 retype(*src, BRW_REGISTER_TYPE_UD));
64 brw_pop_insn_state(p);
65 }
66 *src = brw_message_reg(msg_reg_nr);
67 }
68
69 static void
70 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
71 {
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
76 *
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
79 */
80 const struct gen_device_info *devinfo = p->devinfo;
81 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
82 reg->file = BRW_GENERAL_REGISTER_FILE;
83 reg->nr += GEN7_MRF_HACK_START;
84 }
85 }
86
/**
 * Encode the destination operand of `inst` from `dest`.
 *
 * Range-checks the register number, remaps pretend-MRFs to GRFs on Gen7+,
 * then programs the destination file/type/region fields for direct or
 * register-indirect addressing in either Align1 or Align16 access mode.
 * Finally, narrows the instruction's exec size to the destination width
 * when the register is smaller than the generator's default.
 */
87 void
88 brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
89 {
90 const struct gen_device_info *devinfo = p->devinfo;
91
/* MRF numbers may carry the COMPR4 flag bit; mask it off before the
 * range check.
 */
92 if (dest.file == BRW_MESSAGE_REGISTER_FILE)
93 assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
94 else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
95 assert(dest.nr < 128);
96
97 gen7_convert_mrf_to_grf(p, &dest);
98
99 brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
100 brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
101
/* Direct addressing: encode register/subregister numbers and region. */
102 if (dest.address_mode == BRW_ADDRESS_DIRECT) {
103 brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
104
105 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
106 brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
/* Horizontal stride 0 is not a valid destination stride; promote to 1. */
107 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
108 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
109 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
110 } else {
111 brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
112 brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
113 if (dest.file == BRW_GENERAL_REGISTER_FILE ||
114 dest.file == BRW_MESSAGE_REGISTER_FILE) {
115 assert(dest.writemask != 0);
116 }
117 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
118 * Although Dst.HorzStride is a don't care for Align16, HW needs
119 * this to be programmed as "01".
120 */
121 brw_inst_set_dst_hstride(devinfo, inst, 1);
122 }
123 } else {
/* Register-indirect addressing: subregister plus address immediate. */
124 brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);
125
126 /* These are different sizes in align1 vs align16:
127 */
128 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
129 brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
130 dest.indirect_offset);
131 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
132 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
133 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
134 } else {
135 brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
136 dest.indirect_offset);
137 /* even ignored in da16, still need to set as '01' */
138 brw_inst_set_dst_hstride(devinfo, inst, 1);
139 }
140 }
141
142 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
143 * or 16 (SIMD16), as that's normally correct. However, when dealing with
144 * small registers, we automatically reduce it to match the register size.
145 *
146 * In platforms that support fp64 we can emit instructions with a width of
147 * 4 that need two SIMD8 registers and an exec_size of 8 or 16. In these
148 * cases we need to make sure that these instructions have their exec sizes
149 * set properly when they are emitted and we can't rely on this code to fix
150 * it.
151 */
152 bool fix_exec_size;
153 if (devinfo->gen >= 6)
154 fix_exec_size = dest.width < BRW_EXECUTE_4;
155 else
156 fix_exec_size = dest.width < BRW_EXECUTE_8;
157
158 if (fix_exec_size)
159 brw_inst_set_exec_size(devinfo, inst, dest.width);
160 }
161
/**
 * Encode source operand 0 of `inst` from `reg`.
 *
 * Handles immediates (including 64-bit DF/Q/UQ payloads), direct and
 * register-indirect addressing, the Align1 vstride/width/hstride region
 * encoding, and the Align16 swizzle encoding.  For SEND/SENDC on Gen6+,
 * src0 merely names the GRF/MRF where the message payload starts, so
 * modifiers and indirect addressing are rejected.
 */
162 void
163 brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
164 {
165 const struct gen_device_info *devinfo = p->devinfo;
166
167 if (reg.file == BRW_MESSAGE_REGISTER_FILE)
168 assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
169 else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
170 assert(reg.nr < 128);
171
172 gen7_convert_mrf_to_grf(p, &reg);
173
174 if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
175 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
176 /* Any source modifiers or regions will be ignored, since this just
177 * identifies the MRF/GRF to start reading the message contents from.
178 * Check for some likely failures.
179 */
180 assert(!reg.negate);
181 assert(!reg.abs);
182 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
183 }
184
185 brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
186 brw_inst_set_src0_abs(devinfo, inst, reg.abs);
187 brw_inst_set_src0_negate(devinfo, inst, reg.negate);
188 brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
189
/* Immediate operands: select the 32- or 64-bit payload field by type. */
190 if (reg.file == BRW_IMMEDIATE_VALUE) {
191 if (reg.type == BRW_REGISTER_TYPE_DF ||
192 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
193 brw_inst_set_imm_df(devinfo, inst, reg.df);
194 else if (reg.type == BRW_REGISTER_TYPE_UQ ||
195 reg.type == BRW_REGISTER_TYPE_Q)
196 brw_inst_set_imm_uq(devinfo, inst, reg.u64);
197 else
198 brw_inst_set_imm_ud(devinfo, inst, reg.ud);
199
/* NOTE(review): for sub-64-bit immediates, src1's file/type fields are
 * mirrored from src0's hardware type; 64-bit immediates appear to occupy
 * those bits, so they are left untouched — confirm against the ISA docs.
 */
200 if (type_sz(reg.type) < 8) {
201 brw_inst_set_src1_reg_file(devinfo, inst,
202 BRW_ARCHITECTURE_REGISTER_FILE);
203 brw_inst_set_src1_reg_hw_type(devinfo, inst,
204 brw_inst_src0_reg_hw_type(devinfo, inst));
205 }
206 } else {
207 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
208 brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
209 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
210 brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
211 } else {
212 brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
213 }
214 } else {
215 brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
216
217 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
218 brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
219 } else {
220 brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
221 }
222 }
223
/* Align1 regions: a <width=1> source in a SIMD1 instruction is encoded as
 * the scalar region <0;1,0>; otherwise pass the caller's region through.
 */
224 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
225 if (reg.width == BRW_WIDTH_1 &&
226 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
227 brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
228 brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
229 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
230 } else {
231 brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
232 brw_inst_set_src0_width(devinfo, inst, reg.width);
233 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
234 }
235 } else {
236 brw_inst_set_src0_da16_swiz_x(devinfo, inst,
237 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
238 brw_inst_set_src0_da16_swiz_y(devinfo, inst,
239 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
240 brw_inst_set_src0_da16_swiz_z(devinfo, inst,
241 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
242 brw_inst_set_src0_da16_swiz_w(devinfo, inst,
243 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
244
245 if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
246 /* This is an oddity of the fact we're using the same
247 * descriptions for registers in align_16 as align_1:
248 */
249 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
250 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
251 reg.type == BRW_REGISTER_TYPE_DF &&
252 reg.vstride == BRW_VERTICAL_STRIDE_2) {
253 /* From SNB PRM:
254 *
255 * "For Align16 access mode, only encodings of 0000 and 0011
256 * are allowed. Other codes are reserved."
257 *
258 * Presumably the DevSNB behavior applies to IVB as well.
259 */
260 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
261 } else {
262 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
263 }
264 }
265 }
266 }
267
268
/**
 * Encode source operand 1 of `inst` from `reg`.
 *
 * src1 is more restricted than src0: it cannot be the accumulator (IVB
 * PRM citation below), cannot use indirect addressing, cannot be an MRF,
 * and immediates are limited to 32 bits.  Region/swizzle encoding mirrors
 * brw_set_src0's Align1/Align16 handling.
 */
269 void
270 brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
271 {
272 const struct gen_device_info *devinfo = p->devinfo;
273
274 if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
275 assert(reg.nr < 128);
276
277 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
278 *
279 * "Accumulator registers may be accessed explicitly as src0
280 * operands only."
281 */
282 assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
283 reg.nr != BRW_ARF_ACCUMULATOR);
284
285 gen7_convert_mrf_to_grf(p, &reg);
286 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
287
288 brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
289 brw_inst_set_src1_abs(devinfo, inst, reg.abs);
290 brw_inst_set_src1_negate(devinfo, inst, reg.negate);
291
292 /* Only src1 can be immediate in two-argument instructions.
293 */
294 assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
295
296 if (reg.file == BRW_IMMEDIATE_VALUE) {
297 /* two-argument instructions can only use 32-bit immediates */
298 assert(type_sz(reg.type) < 8);
299 brw_inst_set_imm_ud(devinfo, inst, reg.ud);
300 } else {
301 /* This is a hardware restriction, which may or may not be lifted
302 * in the future:
303 */
304 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
305 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
306
307 brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
308 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
309 brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
310 } else {
311 brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
312 }
313
/* Align1 regions: a <width=1> source in a SIMD1 instruction becomes the
 * scalar region <0;1,0>; otherwise use the caller's region as given.
 */
314 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
315 if (reg.width == BRW_WIDTH_1 &&
316 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
317 brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
318 brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
319 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
320 } else {
321 brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
322 brw_inst_set_src1_width(devinfo, inst, reg.width);
323 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
324 }
325 } else {
326 brw_inst_set_src1_da16_swiz_x(devinfo, inst,
327 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
328 brw_inst_set_src1_da16_swiz_y(devinfo, inst,
329 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
330 brw_inst_set_src1_da16_swiz_z(devinfo, inst,
331 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
332 brw_inst_set_src1_da16_swiz_w(devinfo, inst,
333 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
334
335 if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
336 /* This is an oddity of the fact we're using the same
337 * descriptions for registers in align_16 as align_1:
338 */
339 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
340 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
341 reg.type == BRW_REGISTER_TYPE_DF &&
342 reg.vstride == BRW_VERTICAL_STRIDE_2) {
343 /* From SNB PRM:
344 *
345 * "For Align16 access mode, only encodings of 0000 and 0011
346 * are allowed. Other codes are reserved."
347 *
348 * Presumably the DevSNB behavior applies to IVB as well.
349 */
350 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
351 } else {
352 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
353 }
354 }
355 }
356 }
357
358 /**
359 * Set the Message Descriptor and Extended Message Descriptor fields
360 * for SEND messages.
361 *
362 * \note This zeroes out the Function Control bits, so it must be called
363 * \b before filling out any message-specific data. Callers can
364 * choose not to fill in irrelevant bits; they will be zero.
365 */
366 void
367 brw_set_message_descriptor(struct brw_codegen *p,
368 brw_inst *inst,
369 enum brw_message_target sfid,
370 unsigned msg_length,
371 unsigned response_length,
372 bool header_present,
373 bool end_of_thread)
374 {
375 const struct gen_device_info *devinfo = p->devinfo;
376
377 brw_set_src1(p, inst, brw_imm_d(0));
378
379 /* For indirect sends, `inst` will not be the SEND/SENDC instruction
380 * itself; instead, it will be a MOV/OR into the address register.
381 *
382 * In this case, we avoid setting the extended message descriptor bits,
383 * since they go on the later SEND/SENDC instead and if set here would
384 * instead clobber the conditionalmod bits.
385 */
386 unsigned opcode = brw_inst_opcode(devinfo, inst);
387 if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
388 brw_inst_set_sfid(devinfo, inst, sfid);
389 }
390
391 brw_inst_set_mlen(devinfo, inst, msg_length);
392 brw_inst_set_rlen(devinfo, inst, response_length);
393 brw_inst_set_eot(devinfo, inst, end_of_thread);
394
395 if (devinfo->gen >= 5) {
396 brw_inst_set_header_present(devinfo, inst, header_present);
397 }
398 }
399
400 static void brw_set_math_message( struct brw_codegen *p,
401 brw_inst *inst,
402 unsigned function,
403 unsigned integer_type,
404 bool low_precision,
405 unsigned dataType )
406 {
407 const struct gen_device_info *devinfo = p->devinfo;
408 unsigned msg_length;
409 unsigned response_length;
410
411 /* Infer message length from the function */
412 switch (function) {
413 case BRW_MATH_FUNCTION_POW:
414 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
415 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
416 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
417 msg_length = 2;
418 break;
419 default:
420 msg_length = 1;
421 break;
422 }
423
424 /* Infer response length from the function */
425 switch (function) {
426 case BRW_MATH_FUNCTION_SINCOS:
427 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
428 response_length = 2;
429 break;
430 default:
431 response_length = 1;
432 break;
433 }
434
435
436 brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
437 msg_length, response_length, false, false);
438 brw_inst_set_math_msg_function(devinfo, inst, function);
439 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
440 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
441 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
442 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
443 brw_inst_set_saturate(devinfo, inst, 0);
444 }
445
446
447 static void brw_set_ff_sync_message(struct brw_codegen *p,
448 brw_inst *insn,
449 bool allocate,
450 unsigned response_length,
451 bool end_of_thread)
452 {
453 const struct gen_device_info *devinfo = p->devinfo;
454
455 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
456 1, response_length, true, end_of_thread);
457 brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
458 brw_inst_set_urb_allocate(devinfo, insn, allocate);
459 /* The following fields are not used by FF_SYNC: */
460 brw_inst_set_urb_global_offset(devinfo, insn, 0);
461 brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
462 brw_inst_set_urb_used(devinfo, insn, 0);
463 brw_inst_set_urb_complete(devinfo, insn, 0);
464 }
465
466 static void brw_set_urb_message( struct brw_codegen *p,
467 brw_inst *insn,
468 enum brw_urb_write_flags flags,
469 unsigned msg_length,
470 unsigned response_length,
471 unsigned offset,
472 unsigned swizzle_control )
473 {
474 const struct gen_device_info *devinfo = p->devinfo;
475
476 assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
477 assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
478 assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
479
480 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
481 msg_length, response_length, true,
482 flags & BRW_URB_WRITE_EOT);
483
484 if (flags & BRW_URB_WRITE_OWORD) {
485 assert(msg_length == 2); /* header + one OWORD of data */
486 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
487 } else {
488 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
489 }
490
491 brw_inst_set_urb_global_offset(devinfo, insn, offset);
492 brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
493
494 if (devinfo->gen < 8) {
495 brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
496 }
497
498 if (devinfo->gen < 7) {
499 brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
500 brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
501 } else {
502 brw_inst_set_urb_per_slot_offset(devinfo, insn,
503 !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
504 }
505 }
506
507 void
508 brw_set_dp_write_message(struct brw_codegen *p,
509 brw_inst *insn,
510 unsigned binding_table_index,
511 unsigned msg_control,
512 unsigned msg_type,
513 unsigned target_cache,
514 unsigned msg_length,
515 bool header_present,
516 unsigned last_render_target,
517 unsigned response_length,
518 unsigned end_of_thread,
519 unsigned send_commit_msg)
520 {
521 const struct gen_device_info *devinfo = p->devinfo;
522 const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
523 BRW_SFID_DATAPORT_WRITE);
524
525 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
526 header_present, end_of_thread);
527
528 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
529 brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
530 brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
531 brw_inst_set_rt_last(devinfo, insn, last_render_target);
532 if (devinfo->gen < 7) {
533 brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
534 }
535 }
536
537 void
538 brw_set_dp_read_message(struct brw_codegen *p,
539 brw_inst *insn,
540 unsigned binding_table_index,
541 unsigned msg_control,
542 unsigned msg_type,
543 unsigned target_cache,
544 unsigned msg_length,
545 bool header_present,
546 unsigned response_length)
547 {
548 const struct gen_device_info *devinfo = p->devinfo;
549 const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
550 BRW_SFID_DATAPORT_READ);
551
552 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
553 header_present, false);
554
555 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
556 brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
557 brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
558 if (devinfo->gen < 6)
559 brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
560 }
561
562 void
563 brw_set_sampler_message(struct brw_codegen *p,
564 brw_inst *inst,
565 unsigned binding_table_index,
566 unsigned sampler,
567 unsigned msg_type,
568 unsigned response_length,
569 unsigned msg_length,
570 unsigned header_present,
571 unsigned simd_mode,
572 unsigned return_format)
573 {
574 const struct gen_device_info *devinfo = p->devinfo;
575
576 brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
577 response_length, header_present, false);
578
579 brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
580 brw_inst_set_sampler(devinfo, inst, sampler);
581 brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
582 if (devinfo->gen >= 5) {
583 brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
584 } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
585 brw_inst_set_sampler_return_format(devinfo, inst, return_format);
586 }
587 }
588
589 static void
590 gen7_set_dp_scratch_message(struct brw_codegen *p,
591 brw_inst *inst,
592 bool write,
593 bool dword,
594 bool invalidate_after_read,
595 unsigned num_regs,
596 unsigned addr_offset,
597 unsigned mlen,
598 unsigned rlen,
599 bool header_present)
600 {
601 const struct gen_device_info *devinfo = p->devinfo;
602 assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
603 (devinfo->gen >= 8 && num_regs == 8));
604 const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
605 num_regs - 1);
606
607 brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
608 mlen, rlen, header_present, false);
609 brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
610 brw_inst_set_scratch_read_write(devinfo, inst, write);
611 brw_inst_set_scratch_type(devinfo, inst, dword);
612 brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
613 brw_inst_set_scratch_block_size(devinfo, inst, block_size);
614 brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
615 }
616
617 #define next_insn brw_next_insn
618 brw_inst *
619 brw_next_insn(struct brw_codegen *p, unsigned opcode)
620 {
621 const struct gen_device_info *devinfo = p->devinfo;
622 brw_inst *insn;
623
624 if (p->nr_insn + 1 > p->store_size) {
625 p->store_size <<= 1;
626 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
627 }
628
629 p->next_insn_offset += 16;
630 insn = &p->store[p->nr_insn++];
631 memcpy(insn, p->current, sizeof(*insn));
632
633 brw_inst_set_opcode(devinfo, insn, opcode);
634 return insn;
635 }
636
637 static brw_inst *
638 brw_alu1(struct brw_codegen *p, unsigned opcode,
639 struct brw_reg dest, struct brw_reg src)
640 {
641 brw_inst *insn = next_insn(p, opcode);
642 brw_set_dest(p, insn, dest);
643 brw_set_src0(p, insn, src);
644 return insn;
645 }
646
647 static brw_inst *
648 brw_alu2(struct brw_codegen *p, unsigned opcode,
649 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
650 {
651 /* 64-bit immediates are only supported on 1-src instructions */
652 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
653 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
654
655 brw_inst *insn = next_insn(p, opcode);
656 brw_set_dest(p, insn, dest);
657 brw_set_src0(p, insn, src0);
658 brw_set_src1(p, insn, src1);
659 return insn;
660 }
661
662 static int
663 get_3src_subreg_nr(struct brw_reg reg)
664 {
665 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
666 * use 32-bit units (components 0..7). Since they only support F/D/UD
667 * types, this doesn't lose any flexibility, but uses fewer bits.
668 */
669 return reg.subnr / 4;
670 }
671
/**
 * Emit a three-source ALU instruction: dest = OP(src0, src1, src2).
 *
 * 3-src instructions use dedicated encodings that differ between Align1
 * and Align16 access modes, so each path programs its own register-file,
 * subregister, stride/swizzle and type fields.  All operands must use
 * direct addressing.
 */
672 static brw_inst *
673 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
674 struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
675 {
676 const struct gen_device_info *devinfo = p->devinfo;
677 brw_inst *inst = next_insn(p, opcode);
678
679 gen7_convert_mrf_to_grf(p, &dest);
680
681 assert(dest.nr < 128);
682 assert(src0.nr < 128);
683 assert(src1.nr < 128);
684 assert(src2.nr < 128);
685 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
686 assert(src0.address_mode == BRW_ADDRESS_DIRECT);
687 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
688 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
689
/* Align1 3-src encoding: per-operand register files, scalar or packed
 * regions only, and an explicit integer/float execution type.
 */
690 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
691 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
692 dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
693
694 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
695 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
696 BRW_ALIGN1_3SRC_ACCUMULATOR);
697 brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
698 } else {
699 brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
700 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
701 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
702 }
703 brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
704
705 brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
706
707 if (brw_reg_type_is_floating_point(dest.type)) {
708 brw_inst_set_3src_a1_exec_type(devinfo, inst,
709 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
710 } else {
711 brw_inst_set_3src_a1_exec_type(devinfo, inst,
712 BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
713 }
714
715 brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
716 brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
717 brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
718 brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
719
/* Each source must be either the scalar region <0;1,0> or the packed
 * region <8;8,1> — nothing in between is encodable here.
 */
720 assert((src0.vstride == BRW_VERTICAL_STRIDE_0 &&
721 src0.hstride == BRW_HORIZONTAL_STRIDE_0) ||
722 (src0.vstride == BRW_VERTICAL_STRIDE_8 &&
723 src0.hstride == BRW_HORIZONTAL_STRIDE_1));
724 assert((src1.vstride == BRW_VERTICAL_STRIDE_0 &&
725 src1.hstride == BRW_HORIZONTAL_STRIDE_0) ||
726 (src1.vstride == BRW_VERTICAL_STRIDE_8 &&
727 src1.hstride == BRW_HORIZONTAL_STRIDE_1));
728 assert((src2.vstride == BRW_VERTICAL_STRIDE_0 &&
729 src2.hstride == BRW_HORIZONTAL_STRIDE_0) ||
730 (src2.vstride == BRW_VERTICAL_STRIDE_8 &&
731 src2.hstride == BRW_HORIZONTAL_STRIDE_1));
732
733 brw_inst_set_3src_a1_src0_vstride(devinfo, inst,
734 src0.vstride == BRW_VERTICAL_STRIDE_0 ?
735 BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0 :
736 BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8);
737 brw_inst_set_3src_a1_src1_vstride(devinfo, inst,
738 src1.vstride == BRW_VERTICAL_STRIDE_0 ?
739 BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0 :
740 BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8);
741 /* no vstride on src2 */
742
743 brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
744 src0.hstride == BRW_HORIZONTAL_STRIDE_0 ?
745 BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0 :
746 BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1);
747 brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
748 src1.hstride == BRW_HORIZONTAL_STRIDE_0 ?
749 BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0 :
750 BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1);
751 brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
752 src2.hstride == BRW_HORIZONTAL_STRIDE_0 ?
753 BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0 :
754 BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1);
755
756 brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
757 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
758 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
759 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
760
761 brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
762 if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
763 brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
764 } else {
765 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
766 }
767 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
768 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
769
770 brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
771 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
772 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
773 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
774
/* File restrictions: src0/src2 may be GRF or immediate; src1 may be GRF
 * or the accumulator.
 */
775 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
776 src0.file == BRW_IMMEDIATE_VALUE);
777 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
778 src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
779 assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
780 src2.file == BRW_IMMEDIATE_VALUE);
781
782 brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
783 src0.file == BRW_GENERAL_REGISTER_FILE ?
784 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
785 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
786 brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
787 src1.file == BRW_GENERAL_REGISTER_FILE ?
788 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
789 BRW_ALIGN1_3SRC_ACCUMULATOR);
790 brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
791 src2.file == BRW_GENERAL_REGISTER_FILE ?
792 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
793 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
794 } else {
/* Align16 3-src encoding: GRF-only sources with swizzles and an optional
 * replicate control.
 */
795 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
796 dest.file == BRW_MESSAGE_REGISTER_FILE);
797 assert(dest.type == BRW_REGISTER_TYPE_F ||
798 dest.type == BRW_REGISTER_TYPE_DF ||
799 dest.type == BRW_REGISTER_TYPE_D ||
800 dest.type == BRW_REGISTER_TYPE_UD);
801 if (devinfo->gen == 6) {
802 brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
803 dest.file == BRW_MESSAGE_REGISTER_FILE);
804 }
805 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
806 brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
807 brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
808
809 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
810 brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
811 brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
812 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
813 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
814 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
815 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
816 src0.vstride == BRW_VERTICAL_STRIDE_0);
817
818 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
819 brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
820 brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
821 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
822 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
823 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
824 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
825 src1.vstride == BRW_VERTICAL_STRIDE_0);
826
827 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
828 brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
829 brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
830 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
831 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
832 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
833 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
834 src2.vstride == BRW_VERTICAL_STRIDE_0);
835
836 if (devinfo->gen >= 7) {
837 /* Set both the source and destination types based on dest.type,
838 * ignoring the source register types. The MAD and LRP emitters ensure
839 * that all four types are float. The BFE and BFI2 emitters, however,
840 * may send us mixed D and UD types and want us to ignore that and use
841 * the destination type.
842 */
843 brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
844 brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
845 }
846 }
847
848 return inst;
849 }
850
851
852 /***********************************************************************
853 * Convenience routines.
854 */
/* Define brw_<OP>() as a trivial wrapper emitting a one-source ALU
 * instruction via brw_alu1().
 */
#define ALU1(OP)                                                \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
              struct brw_reg dest,                              \
              struct brw_reg src0)                              \
{                                                               \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);             \
}
862
/* Define brw_<OP>() as a trivial wrapper emitting a two-source ALU
 * instruction via brw_alu2().
 */
#define ALU2(OP)                                                \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
              struct brw_reg dest,                              \
              struct brw_reg src0,                              \
              struct brw_reg src1)                              \
{                                                               \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);       \
}
871
/* Define brw_<OP>() as a trivial wrapper emitting a three-source ALU
 * instruction via brw_alu3().
 */
#define ALU3(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
              struct brw_reg dest,                                \
              struct brw_reg src0,                                \
              struct brw_reg src1,                                \
              struct brw_reg src2)                                \
{                                                                 \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}
881
/* Like ALU3, but additionally assert that all three sources match the
 * destination's floating-point type: all F, or all DF.
 */
#define ALU3F(OP)                                                 \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
              struct brw_reg dest,                                \
              struct brw_reg src0,                                \
              struct brw_reg src1,                                \
              struct brw_reg src2)                                \
{                                                                 \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                     \
          dest.type == BRW_REGISTER_TYPE_DF);                     \
   if (dest.type == BRW_REGISTER_TYPE_F) {                        \
      assert(src0.type == BRW_REGISTER_TYPE_F);                   \
      assert(src1.type == BRW_REGISTER_TYPE_F);                   \
      assert(src2.type == BRW_REGISTER_TYPE_F);                   \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                  \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}
902
903 /* Rounding operations (other than RNDD) require two instructions - the first
904 * stores a rounded value (possibly the wrong way) in the dest register, but
905 * also sets a per-channel "increment bit" in the flag register. A predicated
906 * add of 1.0 fixes dest to contain the desired result.
907 *
908 * Sandybridge and later appear to round correctly without an ADD.
909 */
/* Emit the round instruction, plus (pre-gen6 only) the predicated ADD
 * fix-up described in the comment above.
 */
#define ROUND(OP)                                                     \
void brw_##OP(struct brw_codegen *p,                                  \
              struct brw_reg dest,                                    \
              struct brw_reg src)                                     \
{                                                                     \
   const struct gen_device_info *devinfo = p->devinfo;                \
   brw_inst *rnd, *add;                                               \
   rnd = next_insn(p, BRW_OPCODE_##OP);                               \
   brw_set_dest(p, rnd, dest);                                        \
   brw_set_src0(p, rnd, src);                                         \
                                                                      \
   if (devinfo->gen < 6) {                                            \
      /* turn on round-increments */                                  \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);    \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                  \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);  \
   }                                                                  \
}
928
929
/* Instantiate the convenience emitters defined by the macros above. */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
962
963 brw_inst *
964 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
965 {
966 const struct gen_device_info *devinfo = p->devinfo;
967
968 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
969 * To avoid the problems that causes, we use a <1,2,0> source region to read
970 * each element twice.
971 */
972 if (devinfo->gen == 7 && !devinfo->is_haswell &&
973 brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1 &&
974 dest.type == BRW_REGISTER_TYPE_DF &&
975 (src0.type == BRW_REGISTER_TYPE_F ||
976 src0.type == BRW_REGISTER_TYPE_D ||
977 src0.type == BRW_REGISTER_TYPE_UD) &&
978 !has_scalar_region(src0)) {
979 assert(src0.vstride == BRW_VERTICAL_STRIDE_4 &&
980 src0.width == BRW_WIDTH_4 &&
981 src0.hstride == BRW_HORIZONTAL_STRIDE_1);
982
983 src0.vstride = BRW_VERTICAL_STRIDE_1;
984 src0.width = BRW_WIDTH_2;
985 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
986 }
987
988 return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
989 }
990
991 brw_inst *
992 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
993 struct brw_reg src0, struct brw_reg src1)
994 {
995 /* 6.2.2: add */
996 if (src0.type == BRW_REGISTER_TYPE_F ||
997 (src0.file == BRW_IMMEDIATE_VALUE &&
998 src0.type == BRW_REGISTER_TYPE_VF)) {
999 assert(src1.type != BRW_REGISTER_TYPE_UD);
1000 assert(src1.type != BRW_REGISTER_TYPE_D);
1001 }
1002
1003 if (src1.type == BRW_REGISTER_TYPE_F ||
1004 (src1.file == BRW_IMMEDIATE_VALUE &&
1005 src1.type == BRW_REGISTER_TYPE_VF)) {
1006 assert(src0.type != BRW_REGISTER_TYPE_UD);
1007 assert(src0.type != BRW_REGISTER_TYPE_D);
1008 }
1009
1010 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1011 }
1012
1013 brw_inst *
1014 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1015 struct brw_reg src0, struct brw_reg src1)
1016 {
1017 assert(dest.type == src0.type);
1018 assert(src0.type == src1.type);
1019 switch (src0.type) {
1020 case BRW_REGISTER_TYPE_B:
1021 case BRW_REGISTER_TYPE_UB:
1022 case BRW_REGISTER_TYPE_W:
1023 case BRW_REGISTER_TYPE_UW:
1024 case BRW_REGISTER_TYPE_D:
1025 case BRW_REGISTER_TYPE_UD:
1026 break;
1027 default:
1028 unreachable("Bad type for brw_AVG");
1029 }
1030
1031 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1032 }
1033
1034 brw_inst *
1035 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1036 struct brw_reg src0, struct brw_reg src1)
1037 {
1038 /* 6.32.38: mul */
1039 if (src0.type == BRW_REGISTER_TYPE_D ||
1040 src0.type == BRW_REGISTER_TYPE_UD ||
1041 src1.type == BRW_REGISTER_TYPE_D ||
1042 src1.type == BRW_REGISTER_TYPE_UD) {
1043 assert(dest.type != BRW_REGISTER_TYPE_F);
1044 }
1045
1046 if (src0.type == BRW_REGISTER_TYPE_F ||
1047 (src0.file == BRW_IMMEDIATE_VALUE &&
1048 src0.type == BRW_REGISTER_TYPE_VF)) {
1049 assert(src1.type != BRW_REGISTER_TYPE_UD);
1050 assert(src1.type != BRW_REGISTER_TYPE_D);
1051 }
1052
1053 if (src1.type == BRW_REGISTER_TYPE_F ||
1054 (src1.file == BRW_IMMEDIATE_VALUE &&
1055 src1.type == BRW_REGISTER_TYPE_VF)) {
1056 assert(src0.type != BRW_REGISTER_TYPE_UD);
1057 assert(src0.type != BRW_REGISTER_TYPE_D);
1058 }
1059
1060 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1061 src0.nr != BRW_ARF_ACCUMULATOR);
1062 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1063 src1.nr != BRW_ARF_ACCUMULATOR);
1064
1065 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1066 }
1067
1068 brw_inst *
1069 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1070 struct brw_reg src0, struct brw_reg src1)
1071 {
1072 src0.vstride = BRW_VERTICAL_STRIDE_0;
1073 src0.width = BRW_WIDTH_1;
1074 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1075 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1076 }
1077
1078 brw_inst *
1079 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1080 struct brw_reg src0, struct brw_reg src1)
1081 {
1082 src0.vstride = BRW_VERTICAL_STRIDE_0;
1083 src0.width = BRW_WIDTH_1;
1084 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1085 src1.vstride = BRW_VERTICAL_STRIDE_8;
1086 src1.width = BRW_WIDTH_8;
1087 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1088 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1089 }
1090
/* Emit a float-to-half conversion of src into dst.  Handles the gen7
 * F32TO16 opcode as well as the gen8+ converting-MOV replacement, plus
 * the extra zero-fill MOV needed for UD destinations.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Rewrite the destination as strided words so the second MOV below
       * can explicitly zero the odd (upper) halves.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      /* Gen8+ has no F32TO16 opcode; a converting MOV to HF is used. */
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* The two MOVs write disjoint halves of the same registers, so
       * dependency clearing/checking between them is suppressed.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1137
1138 brw_inst *
1139 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1140 {
1141 const struct gen_device_info *devinfo = p->devinfo;
1142 bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
1143
1144 if (align16) {
1145 assert(src.type == BRW_REGISTER_TYPE_UD);
1146 } else {
1147 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1148 *
1149 * Because this instruction does not have a 16-bit floating-point
1150 * type, the source data type must be Word (W). The destination type
1151 * must be F (Float).
1152 */
1153 if (src.type == BRW_REGISTER_TYPE_UD)
1154 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1155
1156 assert(src.type == BRW_REGISTER_TYPE_W ||
1157 src.type == BRW_REGISTER_TYPE_UW ||
1158 src.type == BRW_REGISTER_TYPE_HF);
1159 }
1160
1161 if (devinfo->gen >= 8) {
1162 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1163 } else {
1164 assert(devinfo->gen == 7);
1165 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1166 }
1167 }
1168
1169
1170 void brw_NOP(struct brw_codegen *p)
1171 {
1172 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1173 memset(insn, 0, sizeof(*insn));
1174 brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1175 }
1176
1177
1178
1179
1180
1181 /***********************************************************************
1182 * Comparisons, if/else/endif
1183 */
1184
1185 brw_inst *
1186 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1187 unsigned predicate_control)
1188 {
1189 const struct gen_device_info *devinfo = p->devinfo;
1190 struct brw_reg ip = brw_ip_reg();
1191 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1192
1193 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1194 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1195 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1196 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1197
1198 return inst;
1199 }
1200
1201 static void
1202 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1203 {
1204 p->if_stack[p->if_stack_depth] = inst - p->store;
1205
1206 p->if_stack_depth++;
1207 if (p->if_stack_array_size <= p->if_stack_depth) {
1208 p->if_stack_array_size *= 2;
1209 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1210 p->if_stack_array_size);
1211 }
1212 }
1213
1214 static brw_inst *
1215 pop_if_stack(struct brw_codegen *p)
1216 {
1217 p->if_stack_depth--;
1218 return &p->store[p->if_stack[p->if_stack_depth]];
1219 }
1220
1221 static void
1222 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1223 {
1224 if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1225 p->loop_stack_array_size *= 2;
1226 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1227 p->loop_stack_array_size);
1228 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1229 p->loop_stack_array_size);
1230 }
1231
1232 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1233 p->loop_stack_depth++;
1234 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1235 }
1236
1237 static brw_inst *
1238 get_inner_do_insn(struct brw_codegen *p)
1239 {
1240 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1241 }
1242
1243 /* EU takes the value from the flag register and pushes it onto some
1244 * sort of a stack (presumably merging with any flag value already on
1245 * the stack). Within an if block, the flags at the top of the stack
1246 * control execution on each channel of the unit, eg. on each of the
1247 * 16 pixel values in our wm programs.
1248 *
1249 * When the matching 'else' instruction is reached (presumably by
1250 * countdown of the instruction count patched in by our ELSE/ENDIF
1251 * functions), the relevant flags are inverted.
1252 *
1253 * When the matching 'endif' instruction is reached, the flags are
1254 * popped off. If the stack is now empty, normal execution resumes.
1255 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      /* Pre-gen6: jump offset goes in src1; patched later via
       * patch_IF_ELSE().
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: the jump count lives in the destination immediate field. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: dedicated JIP/UIP fields, zeroed here and patched later. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control needs a thread switch, unless SPF mode will
    * later convert this IF into a plain ADD.
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Remember the IF so brw_ENDIF() can patch its jump targets. */
   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1299
1300 /* This function is only used for gen6-style IF instructions with an
1301 * embedded comparison (conditional modifier). It is not used on gen7.
1302 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* The gen6 jump count (destination immediate) is zeroed here and
    * patched later via brw_ENDIF() -> patch_IF_ELSE().
    */
   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   /* The comparison operands are embedded in the IF itself. */
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1326
1327 /**
1328 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1329 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ELSE == 0 ? 0 : BRW_OPCODE_ADD);

      /* The jump distances are scaled by 16 (the instruction size in
       * bytes, since these ADDs operate directly on IP).
       */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1367
1368 /**
1369 * Patch IF and ELSE instructions with appropriate jump targets.
1370 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* Jump counts are scaled per generation (see brw_jump_scale). */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1458
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* Per-generation operand encoding; jump targets are zeroed here and
    * patched later by brw_ENDIF() -> patch_IF_ELSE().
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control needs a thread switch unless SPF mode will
    * convert this ELSE into an ADD.
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Pushed on top of the matching IF; popped again by brw_ENDIF(). */
   push_if_stack(p, insn);
}
1496
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encoding for the ENDIF itself. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   /* Fill in the jump targets of the IF (and ELSE, if any) now that the
    * position of the ENDIF is known.
    */
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1577
/* Emit a BREAK; its jump targets are filled in later (pre-gen6 by
 * brw_patch_break_cont(), gen6+ by brw_set_uip_jip per the comment above
 * brw_patch_break_cont).
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      /* Pre-gen6 BREAK jumps through IP and must also pop the IF-stack
       * entries pushed inside the loop.
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));

   return insn;
}
1605
/* Emit a CONTINUE; like BREAK, its jump target is patched later. */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      /* Pop the IF-stack entries pushed inside the loop. */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   return insn;
}
1630
/* Emit a HALT instruction (gen6+ encoding). */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   return insn;
}
1651
1652 /* DO/WHILE loop:
1653 *
1654 * The DO/WHILE is just an unterminated loop -- break or continue are
1655 * used for control within the loop. We have a few ways they can be
1656 * done.
1657 *
1658 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1659 * jip and no DO instruction.
1660 *
1661 * For non-uniform control flow pre-gen6, there's a DO instruction to
1662 * push the mask, and a WHILE to jump back, and BREAK to get out and
1663 * pop the mask.
1664 *
1665 * For gen6, there's no more mask stack, so no need for DO. WHILE
1666 * just points back to the first instruction of the loop.
1667 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; just remember where the loop body
       * starts so brw_WHILE() can jump back to it.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1694
1695 /**
1696 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1697 * instruction here.
1698 *
1699 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1700 * nesting, since it can always just point to the end of the block/current loop.
1701 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   /* Walk backwards from the WHILE to the matching DO, patching every
    * still-unpatched BREAK/CONTINUE in between.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps just past the WHILE. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONTINUE jumps to the WHILE itself. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
1726
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      /* Gen6+: no DO instruction exists; WHILE's backward jump points at
       * the first instruction of the loop body.
       */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn,
                             brw_inst_exec_size(devinfo, p->current));

   } else {
      if (p->single_program_flow) {
         /* SPF mode: the WHILE degenerates to a scalar IP-relative ADD. */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Fix up any BREAK/CONTINUE inside this loop now that the WHILE's
          * position is known.
          */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1789
1790 /* FORWARD JUMPS:
1791 */
1792 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1793 {
1794 const struct gen_device_info *devinfo = p->devinfo;
1795 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1796 unsigned jmpi = 1;
1797
1798 if (devinfo->gen >= 5)
1799 jmpi = 2;
1800
1801 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1802 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1803
1804 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1805 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1806 }
1807
1808 /* To integrate with the above, it makes sense that the comparison
1809 * instruction should populate the flag register. It might be simpler
1810 * just to use the flag reg for most WM tasks?
1811 */
1812 void brw_CMP(struct brw_codegen *p,
1813 struct brw_reg dest,
1814 unsigned conditional,
1815 struct brw_reg src0,
1816 struct brw_reg src1)
1817 {
1818 const struct gen_device_info *devinfo = p->devinfo;
1819 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1820
1821 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1822 brw_set_dest(p, insn, dest);
1823 brw_set_src0(p, insn, src0);
1824 brw_set_src1(p, insn, src1);
1825
1826 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1827 * page says:
1828 * "Any CMP instruction with a null destination must use a {switch}."
1829 *
1830 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1831 * mentioned on their work-arounds pages.
1832 */
1833 if (devinfo->gen == 7) {
1834 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1835 dest.nr == BRW_ARF_NULL) {
1836 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1837 }
1838 }
1839 }
1840
1841 /***********************************************************************
1842 * Helpers for the various SEND message types:
1843 */
1844
1845 /** Extended math function, float[8].
1846 */
1847 void gen4_math(struct brw_codegen *p,
1848 struct brw_reg dest,
1849 unsigned function,
1850 unsigned msg_reg_nr,
1851 struct brw_reg src,
1852 unsigned precision )
1853 {
1854 const struct gen_device_info *devinfo = p->devinfo;
1855 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1856 unsigned data_type;
1857 if (has_scalar_region(src)) {
1858 data_type = BRW_MATH_DATA_SCALAR;
1859 } else {
1860 data_type = BRW_MATH_DATA_VECTOR;
1861 }
1862
1863 assert(devinfo->gen < 6);
1864
1865 /* Example code doesn't set predicate_control for send
1866 * instructions.
1867 */
1868 brw_inst_set_pred_control(devinfo, insn, 0);
1869 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1870
1871 brw_set_dest(p, insn, dest);
1872 brw_set_src0(p, insn, src);
1873 brw_set_math_message(p,
1874 insn,
1875 function,
1876 src.type == BRW_REGISTER_TYPE_D,
1877 precision,
1878 data_type);
1879 }
1880
/**
 * Extended math function for gen6+, using the native MATH instruction
 * instead of a SEND to the shared math unit.
 *
 * Asserts encode the hardware restrictions: GRF (or gen7+ MRF)
 * destination, unit horizontal strides, integer sources for the INT_DIV
 * variants and float sources otherwise, and no source modifiers on gen6.
 */
void gen6_math(struct brw_codegen *p,
	       struct brw_reg dest,
	       unsigned function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      /* Integer division: sources must be integer; src1 may only be an
       * immediate on gen8+.
       */
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1927
1928 /**
1929 * Return the right surface index to access the thread scratch space using
1930 * stateless dataport messages.
1931 */
1932 unsigned
1933 brw_scratch_surface_idx(const struct brw_codegen *p)
1934 {
1935 /* The scratch space is thread-local so IA coherency is unnecessary. */
1936 if (p->devinfo->gen >= 8)
1937 return GEN8_BTI_STATELESS_NON_COHERENT;
1938 else
1939 return BRW_BTI_STATELESS;
1940 }
1941
1942 /**
1943 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1944 * using a constant offset per channel.
1945 *
1946 * The offset must be aligned to oword size (16 bytes). Used for
1947 * register spilling.
1948 */
1949 void brw_oword_block_write_scratch(struct brw_codegen *p,
1950 struct brw_reg mrf,
1951 int num_regs,
1952 unsigned offset)
1953 {
1954 const struct gen_device_info *devinfo = p->devinfo;
1955 const unsigned target_cache =
1956 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
1957 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
1958 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
1959 uint32_t msg_type;
1960
1961 if (devinfo->gen >= 6)
1962 offset /= 16;
1963
1964 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1965
1966 const unsigned mlen = 1 + num_regs;
1967
1968 /* Set up the message header. This is g0, with g0.2 filled with
1969 * the offset. We don't want to leave our offset around in g0 or
1970 * it'll screw up texture samples, so set it up inside the message
1971 * reg.
1972 */
1973 {
1974 brw_push_insn_state(p);
1975 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1976 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1977 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1978
1979 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1980
1981 /* set message header global offset field (reg 0, element 2) */
1982 brw_MOV(p,
1983 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1984 mrf.nr,
1985 2), BRW_REGISTER_TYPE_UD),
1986 brw_imm_ud(offset));
1987
1988 brw_pop_insn_state(p);
1989 }
1990
1991 {
1992 struct brw_reg dest;
1993 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1994 int send_commit_msg;
1995 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1996 BRW_REGISTER_TYPE_UW);
1997
1998 brw_inst_set_compression(devinfo, insn, false);
1999
2000 if (brw_inst_exec_size(devinfo, insn) >= 16)
2001 src_header = vec16(src_header);
2002
2003 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2004 if (devinfo->gen < 6)
2005 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2006
2007 /* Until gen6, writes followed by reads from the same location
2008 * are not guaranteed to be ordered unless write_commit is set.
2009 * If set, then a no-op write is issued to the destination
2010 * register to set a dependency, and a read from the destination
2011 * can be used to ensure the ordering.
2012 *
2013 * For gen6, only writes between different threads need ordering
2014 * protection. Our use of DP writes is all about register
2015 * spilling within a thread.
2016 */
2017 if (devinfo->gen >= 6) {
2018 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2019 send_commit_msg = 0;
2020 } else {
2021 dest = src_header;
2022 send_commit_msg = 1;
2023 }
2024
2025 brw_set_dest(p, insn, dest);
2026 if (devinfo->gen >= 6) {
2027 brw_set_src0(p, insn, mrf);
2028 } else {
2029 brw_set_src0(p, insn, brw_null_reg());
2030 }
2031
2032 if (devinfo->gen >= 6)
2033 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2034 else
2035 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2036
2037 brw_set_dp_write_message(p,
2038 insn,
2039 brw_scratch_surface_idx(p),
2040 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2041 msg_type,
2042 target_cache,
2043 mlen,
2044 true, /* header_present */
2045 0, /* not a render target */
2046 send_commit_msg, /* response_length */
2047 0, /* eot */
2048 send_commit_msg);
2049 }
2050 }
2051
2052
2053 /**
2054 * Read a block of owords (half a GRF each) from the scratch buffer
2055 * using a constant index per channel.
2056 *
2057 * Offset must be aligned to oword size (16 bytes). Used for register
2058 * spilling.
2059 */
2060 void
2061 brw_oword_block_read_scratch(struct brw_codegen *p,
2062 struct brw_reg dest,
2063 struct brw_reg mrf,
2064 int num_regs,
2065 unsigned offset)
2066 {
2067 const struct gen_device_info *devinfo = p->devinfo;
2068
2069 if (devinfo->gen >= 6)
2070 offset /= 16;
2071
2072 if (p->devinfo->gen >= 7) {
2073 /* On gen 7 and above, we no longer have message registers and we can
2074 * send from any register we want. By using the destination register
2075 * for the message, we guarantee that the implied message write won't
2076 * accidentally overwrite anything. This has been a problem because
2077 * the MRF registers and source for the final FB write are both fixed
2078 * and may overlap.
2079 */
2080 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2081 } else {
2082 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2083 }
2084 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2085
2086 const unsigned rlen = num_regs;
2087 const unsigned target_cache =
2088 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2089 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2090 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2091
2092 {
2093 brw_push_insn_state(p);
2094 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2095 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2096 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2097
2098 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2099
2100 /* set message header global offset field (reg 0, element 2) */
2101 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2102
2103 brw_pop_insn_state(p);
2104 }
2105
2106 {
2107 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2108
2109 assert(brw_inst_pred_control(devinfo, insn) == 0);
2110 brw_inst_set_compression(devinfo, insn, false);
2111
2112 brw_set_dest(p, insn, dest); /* UW? */
2113 if (devinfo->gen >= 6) {
2114 brw_set_src0(p, insn, mrf);
2115 } else {
2116 brw_set_src0(p, insn, brw_null_reg());
2117 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2118 }
2119
2120 brw_set_dp_read_message(p,
2121 insn,
2122 brw_scratch_surface_idx(p),
2123 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2124 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2125 target_cache,
2126 1, /* msg_length */
2127 true, /* header_present */
2128 rlen);
2129 }
2130 }
2131
/**
 * Read a block of registers from scratch space on gen7+ using the
 * dedicated DP scratch-read message (HWORD granularity, header-only
 * payload).
 *
 * \param dest     destination for the num_regs registers read.
 * \param num_regs number of full registers (HWORDs) to read.
 * \param offset   byte offset into scratch; must be register-aligned and
 *                 fit in the 12-bit HWORD offset field.
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
2165
2166 /**
2167 * Read float[4] vectors from the data port constant cache.
2168 * Location (in buffer) should be a multiple of 16.
2169 * Used for fetching shader constants.
2170 */
/**
 * Read float[4] vectors from the data port constant cache.
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 *
 * \param dest             destination register for the read data.
 * \param mrf              message register used to build the header.
 * \param offset           byte offset into the buffer (converted to owords
 *                         on gen6+).
 * \param bind_table_index surface to read from.
 */
void brw_oword_block_read(struct brw_codegen *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Outer state push: disable predication/compression/masking for both
    * the header setup and the SEND itself.
    */
   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Inner push: header MOVs always run at SIMD8. */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_dp_read_message(p, insn, bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           target_cache,
                           1, /* msg_length */
                           true, /* header_present */
                           DIV_ROUND_UP(exec_size, 8)); /* response_length */

   brw_pop_insn_state(p);
}
2229
2230
/**
 * Emit a render-target write (framebuffer write) message.
 *
 * On gen6+ this is a headerless SENDC taking the color payload directly;
 * before gen6 it is a SEND from a fixed MRF range with an implied-header
 * source.  The destination is the null register (widened to vec16 for
 * SIMD16 dispatch); any response is only used for its write-commit side
 * effect.
 */
void brw_fb_WRITE(struct brw_codegen *p,
                  struct brw_reg payload,
                  struct brw_reg implied_header,
                  unsigned msg_control,
                  unsigned binding_table_index,
                  unsigned msg_length,
                  unsigned response_length,
                  bool eot,
                  bool last_render_target,
                  bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   /* SENDC waits for prior RT writes to the same pixels (gen6+ only). */
   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
                            insn,
                            binding_table_index,
                            msg_control,
                            msg_type,
                            target_cache,
                            msg_length,
                            header_present,
                            last_render_target,
                            response_length,
                            eot,
                            0 /* send_commit_msg */);
}
2290
/**
 * Emit a gen9+ render-target read message (SENDC to the render cache).
 *
 * The message subtype selects SIMD16 (0) or SIMD8 (1) based on the
 * current execution size, and bit 5 of the message control selects
 * per-sample operation.  The RT slot group is derived from the current
 * quarter control so second-half SIMD8 instructions read the right
 * slots.  Returns the emitted SENDC instruction.
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   const unsigned msg_subtype =
      brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_dp_read_message(p, insn, binding_table_index,
                           per_sample << 5 | msg_subtype,
                           GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                           GEN6_SFID_DATAPORT_RENDER_CACHE,
                           msg_length, true /* header_present */,
                           response_length);
   brw_inst_set_rt_slot_group(devinfo, insn,
                              brw_inst_qtr_control(devinfo, p->current) / 2);

   return insn;
}
2319
2320 /**
2321 * Texture sample instruction.
2322 * Note: the msg_type plus msg_length values determine exactly what kind
2323 * of sampling operation is performed. See volume 4, page 161 of docs.
2324 */
/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 *
 * \param msg_reg_nr  base message register; -1 (as unsigned) skips the
 *                    gen6 implied-move resolution.
 */
void brw_SAMPLE(struct brw_codegen *p,
		struct brw_reg dest,
		unsigned msg_reg_nr,
		struct brw_reg src0,
		unsigned binding_table_index,
		unsigned sampler,
		unsigned msg_type,
		unsigned response_length,
		unsigned msg_length,
		unsigned header_present,
		unsigned simd_mode,
		unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr is unsigned; this compares against (unsigned)-1, the
    * "no message register" sentinel.
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
2376
2377 /* Adjust the message header's sampler state pointer to
2378 * select the correct group of 16 samplers.
2379 */
/* Adjust the message header's sampler state pointer to
 * select the correct group of 16 samplers.
 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         /* Only HSW and gen8+ can address more than 16 samplers. */
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         /* Bump the pointer in g0.3 by the size of a full group of 16
          * sampler states (16 * 16 bytes), rounded down to the group
          * containing this sampler.
          */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* Extract the group-of-16 index (bits 7:4) and scale it to a
       * 256-byte group offset: (index & 0xf0) << 4 == (index / 16) * 256.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2422
2423 /* All these variables are pretty confusing - we might be better off
2424 * using bitmasks and macros for this, in the old style. Or perhaps
2425 * just having the caller instantiate the fields in dword3 itself.
2426 */
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header by
       * OR-ing 0xff00 into dword 5 of the header, without predication
       * or masking.
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2473
/**
 * Emit a SEND with either an immediate or an indirect (register) message
 * descriptor.
 *
 * For an immediate descriptor the SEND itself carries it in src1.  For a
 * register descriptor, the descriptor is first ORed into address register
 * a0 (so callers can OR in further bits via brw_set_*_message() on the
 * returned instruction) and the SEND reads it from there.
 *
 * Returns the "setup" instruction — the SEND in the immediate case, the
 * OR in the indirect case — which is the instruction whose descriptor
 * fields the caller should continue to fill in.
 */
struct brw_inst *
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;
   int setup;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   /* We hold on to the setup instruction (the SEND in the direct case, the OR
    * in the indirect case) by its index in the instruction store.  The
    * pointer returned by next_insn() may become invalid if emitting the SEND
    * in the indirect case reallocs the store.
    */

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      setup = p->nr_insn;
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, desc);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the usual
       * brw_set_*_message() helper functions.
       */
      setup = p->nr_insn;
      brw_OR(p, addr, desc, brw_imm_ud(0));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   if (dst.width < BRW_EXECUTE_8)
      brw_inst_set_exec_size(devinfo, send, dst.width);

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);

   return &p->store[setup];
}
2530
/**
 * Emit a surface-access SEND whose binding-table index may itself be a
 * register (indirect surface indexing).
 *
 * A non-immediate surface index is first masked to 8 bits and loaded into
 * address register a0, from which brw_send_indirect_message() builds the
 * descriptor.  Message/response lengths and header presence are set on
 * the returned setup instruction.
 */
static struct brw_inst *
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned message_len,
                                  unsigned response_len,
                                  bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      insn = brw_AND(p, addr,
                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                               BRW_GET_SWZ(surface.swizzle, 0)),
                     brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
   brw_inst_set_mlen(devinfo, insn, message_len);
   brw_inst_set_rlen(devinfo, insn, response_len);
   brw_inst_set_header_present(devinfo, insn, header_present);

   return insn;
}
2572
2573 static bool
2574 while_jumps_before_offset(const struct gen_device_info *devinfo,
2575 brw_inst *insn, int while_offset, int start_offset)
2576 {
2577 int scale = 16 / brw_jump_scale(devinfo);
2578 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2579 : brw_inst_jip(devinfo, insn);
2580 assert(jip < 0);
2581 return while_offset + jip * scale <= start_offset;
2582 }
2583
2584
/**
 * Scan forward from \p start_offset for the instruction that ends the
 * current control-flow block: an ENDIF, ELSE, HALT, or loop-closing WHILE
 * at nesting depth 0.  IF/ENDIF pairs encountered along the way adjust
 * the depth so inner blocks are skipped.  Returns the offset of the block
 * end, or 0 if none is found before the end of the program.
 */
static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct gen_device_info *devinfo = p->devinfo;

   int depth = 0;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_IF:
         depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case BRW_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop.  Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         /* fallthrough */
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_HALT:
         if (depth == 0)
            return offset;
      }
   }

   /* No block end found. */
   return 0;
}
2624
2625 /* There is no DO instruction on gen6, so to find the end of the loop
2626 * we have to see if the loop is jumping back before our start
2627 * instruction.
2628 */
/* There is no DO instruction on gen6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 */
static int
brw_find_loop_end(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   void *store = p->store;

   assert(devinfo->gen >= 6);

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      /* The first WHILE that jumps back past start_offset closes the loop
       * containing start_offset; WHILEs of sibling loops jump to points
       * after it and are skipped.
       */
      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
         if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
            return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}
2654
2655 /* After program generation, go back and update the UIP and JIP of
2656 * BREAK, CONT, and HALT instructions to their correct locations.
2657 */
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * Only runs on gen6+ (earlier gens use the gen4 jump/pop counts patched
 * by brw_patch_break_cont()).  Jump distances are stored in units of
 * 16 / brw_jump_scale() bytes.  The loop walks the store in fixed 16-byte
 * steps, which is only valid because no instruction is compacted yet
 * (asserted below).
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset +
                           (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end just falls through to the
          * next instruction (a jump of one unit).
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }
   }
}
2727
/**
 * Emit an FF_SYNC URB message (gen5/gen6 fixed-function synchronization).
 *
 * \param allocate        request URB handle allocation.
 * \param response_length number of response registers expected.
 * \param eot             set end-of-thread on the message.
 */
void brw_ff_sync(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   unsigned response_length,
		   bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
			   insn,
			   allocate,
			   response_length,
			   eot);
}
2755
2756 /**
2757 * Emit the SEND instruction necessary to generate stream output data on Gen6
2758 * (for transform feedback).
2759 *
2760 * If send_commit_msg is true, this is the last piece of stream output data
2761 * from this thread, so send the data as a committed write. According to the
2762 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2763 *
2764 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2765 * writes are complete by sending the final write as a committed write."
2766 */
/**
 * Emit the SEND instruction necessary to generate stream output data on Gen6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write.  According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *    writes are complete by sending the final write as a committed write."
 */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Target cache selection mirrors the other dataport helpers in this
    * file; in practice this function is used on gen6 (render cache).
    */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));
   brw_set_dp_write_message(p, insn,
                            binding_table_index,
                            0, /* msg_control: ignored */
                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                            target_cache,
                            1, /* msg_length */
                            true, /* header_present */
                            0, /* last_render_target: ignored */
                            send_commit_msg, /* response_length */
                            0, /* end_of_thread */
                            send_commit_msg); /* send_commit_msg */
}
2800
2801 static unsigned
2802 brw_surface_payload_size(struct brw_codegen *p,
2803 unsigned num_channels,
2804 bool has_simd4x2,
2805 bool has_simd16)
2806 {
2807 if (has_simd4x2 &&
2808 brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2809 return 1;
2810 else if (has_simd16 &&
2811 brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16)
2812 return 2 * num_channels;
2813 else
2814 return num_channels;
2815 }
2816
/**
 * Fill in the descriptor fields of an untyped atomic dataport message:
 * message type, atomic opcode, SIMD mode and return-data flag.
 *
 * \param atomic_op          BRW_AOP_* code, placed in the low msg_control bits
 * \param response_expected  request return of the pre-operation value
 */
static void
brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
                                  brw_inst *insn,
                                  unsigned atomic_op,
                                  bool response_expected)
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_control =
      atomic_op |   /* Atomic Operation Type: BRW_AOP_* */
      (response_expected ? 1 << 5 : 0); /* Return data expected */

   /* HSW and later use the DC port 1 encodings, including a dedicated
    * SIMD4x2 message type for Align16 mode.
    */
   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
            msg_control |= 1 << 4; /* SIMD8 mode */

         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
      } else {
         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
      }
   } else {
      /* IVB: single message type, SIMD mode selected via msg_control. */
      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);

      if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
         msg_control |= 1 << 4; /* SIMD8 mode */
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
2849
/**
 * Emit an untyped atomic operation SEND.
 *
 * \param dst                register receiving the old value when
 *                           \p response_expected is set
 * \param payload            message payload (addresses and source operands)
 * \param surface            surface descriptor; NOTE(review): presumably an
 *                           immediate BTI or a register holding one --
 *                           confirm against brw_send_indirect_surface_message()
 * \param atomic_op          BRW_AOP_* operation code
 * \param msg_length         payload length in registers
 * \param response_expected  whether the pre-operation value is returned
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   /* Note: response_expected doubles as the channel count (0 or 1) for the
    * response-length computation.
    */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, true),
      align1);

   brw_set_dp_untyped_atomic_message(
      p, insn, atomic_op, response_expected);
}
2880
2881 static void
2882 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2883 struct brw_inst *insn,
2884 unsigned num_channels)
2885 {
2886 const struct gen_device_info *devinfo = p->devinfo;
2887 /* Set mask of 32-bit channels to drop. */
2888 unsigned msg_control = 0xf & (0xf << num_channels);
2889
2890 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2891 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
2892 msg_control |= 1 << 4; /* SIMD16 mode */
2893 else
2894 msg_control |= 2 << 4; /* SIMD8 mode */
2895 }
2896
2897 brw_inst_set_dp_msg_type(devinfo, insn,
2898 (devinfo->gen >= 8 || devinfo->is_haswell ?
2899 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2900 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2901 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2902 }
2903
/**
 * Emit an untyped surface read SEND.
 *
 * \param dst           destination for the read data
 * \param payload       message payload (addresses)
 * \param surface       surface descriptor; NOTE(review): presumably an
 *                      immediate BTI or a register holding one -- confirm
 *                      against brw_send_indirect_surface_message()
 * \param msg_length    payload length in registers
 * \param num_channels  number of 32-bit channels to read per slot
 */
void
brw_untyped_surface_read(struct brw_codegen *p,
                         struct brw_reg dst,
                         struct brw_reg payload,
                         struct brw_reg surface,
                         unsigned msg_length,
                         unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, dst, payload, surface, msg_length,
      brw_surface_payload_size(p, num_channels, true, true),
      false);

   brw_set_dp_untyped_surface_read_message(
      p, insn, num_channels);
}
2924
2925 static void
2926 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
2927 struct brw_inst *insn,
2928 unsigned num_channels)
2929 {
2930 const struct gen_device_info *devinfo = p->devinfo;
2931 /* Set mask of 32-bit channels to drop. */
2932 unsigned msg_control = 0xf & (0xf << num_channels);
2933
2934 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2935 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
2936 msg_control |= 1 << 4; /* SIMD16 mode */
2937 else
2938 msg_control |= 2 << 4; /* SIMD8 mode */
2939 } else {
2940 if (devinfo->gen >= 8 || devinfo->is_haswell)
2941 msg_control |= 0 << 4; /* SIMD4x2 mode */
2942 else
2943 msg_control |= 2 << 4; /* SIMD8 mode */
2944 }
2945
2946 brw_inst_set_dp_msg_type(devinfo, insn,
2947 devinfo->gen >= 8 || devinfo->is_haswell ?
2948 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
2949 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
2950 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2951 }
2952
/**
 * Emit an untyped surface write SEND.  No response is requested.
 *
 * \param payload       message payload (addresses followed by data)
 * \param surface       surface descriptor; NOTE(review): presumably an
 *                      immediate BTI or a register holding one -- confirm
 *                      against brw_send_indirect_surface_message()
 * \param msg_length    payload length in registers
 * \param num_channels  number of 32-bit channels to write per slot
 */
void
brw_untyped_surface_write(struct brw_codegen *p,
                          struct brw_reg payload,
                          struct brw_reg surface,
                          unsigned msg_length,
                          unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
                          WRITEMASK_X : WRITEMASK_XYZW;
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(brw_null_reg(), mask),
      payload, surface, msg_length, 0, align1);

   brw_set_dp_untyped_surface_write_message(
      p, insn, num_channels);
}
2975
/**
 * Fill in the descriptor fields of a typed atomic dataport message:
 * message type, atomic opcode, slot-group select and return-data flag.
 *
 * \param atomic_op          BRW_AOP_* code, placed in the low msg_control bits
 * \param response_expected  request return of the pre-operation value
 */
static void
brw_set_dp_typed_atomic_message(struct brw_codegen *p,
                                struct brw_inst *insn,
                                unsigned atomic_op,
                                bool response_expected)
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_control =
      atomic_op |   /* Atomic Operation Type: BRW_AOP_* */
      (response_expected ? 1 << 5 : 0); /* Return data expected */

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         /* Odd quarters use the high half of the pixel/sample mask. */
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */

         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
      } else {
         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
      }

   } else {
      /* IVB routes typed atomics through the render cache. */
      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);

      if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
         msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
3009
3010 void
3011 brw_typed_atomic(struct brw_codegen *p,
3012 struct brw_reg dst,
3013 struct brw_reg payload,
3014 struct brw_reg surface,
3015 unsigned atomic_op,
3016 unsigned msg_length,
3017 bool response_expected) {
3018 const struct gen_device_info *devinfo = p->devinfo;
3019 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3020 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3021 GEN6_SFID_DATAPORT_RENDER_CACHE);
3022 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3023 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3024 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3025 struct brw_inst *insn = brw_send_indirect_surface_message(
3026 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3027 brw_surface_payload_size(p, response_expected,
3028 devinfo->gen >= 8 || devinfo->is_haswell, false),
3029 true);
3030
3031 brw_set_dp_typed_atomic_message(
3032 p, insn, atomic_op, response_expected);
3033 }
3034
3035 static void
3036 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3037 struct brw_inst *insn,
3038 unsigned num_channels)
3039 {
3040 const struct gen_device_info *devinfo = p->devinfo;
3041 /* Set mask of unused channels. */
3042 unsigned msg_control = 0xf & (0xf << num_channels);
3043
3044 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3045 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3046 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3047 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3048 else
3049 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3050 }
3051
3052 brw_inst_set_dp_msg_type(devinfo, insn,
3053 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3054 } else {
3055 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3056 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3057 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3058 }
3059
3060 brw_inst_set_dp_msg_type(devinfo, insn,
3061 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3062 }
3063
3064 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3065 }
3066
/**
 * Emit a typed surface read SEND.
 *
 * \param dst           destination for the read data
 * \param payload       message payload (coordinates)
 * \param surface       surface descriptor passed through to
 *                      brw_send_indirect_surface_message()
 * \param msg_length    payload length in registers
 * \param num_channels  number of 32-bit channels to read per slot
 */
void
brw_typed_surface_read(struct brw_codegen *p,
                       struct brw_reg dst,
                       struct brw_reg payload,
                       struct brw_reg surface,
                       unsigned msg_length,
                       unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Typed messages use DC port 1 on HSW+ and the render cache on IVB. */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN6_SFID_DATAPORT_RENDER_CACHE);
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, dst, payload, surface, msg_length,
      brw_surface_payload_size(p, num_channels,
                               devinfo->gen >= 8 || devinfo->is_haswell, false),
      true);

   brw_set_dp_typed_surface_read_message(
      p, insn, num_channels);
}
3088
3089 static void
3090 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3091 struct brw_inst *insn,
3092 unsigned num_channels)
3093 {
3094 const struct gen_device_info *devinfo = p->devinfo;
3095 /* Set mask of unused channels. */
3096 unsigned msg_control = 0xf & (0xf << num_channels);
3097
3098 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3099 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3100 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3101 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3102 else
3103 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3104 }
3105
3106 brw_inst_set_dp_msg_type(devinfo, insn,
3107 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3108
3109 } else {
3110 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3111 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3112 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3113 }
3114
3115 brw_inst_set_dp_msg_type(devinfo, insn,
3116 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3117 }
3118
3119 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3120 }
3121
/**
 * Emit a typed surface write SEND.  No response is requested.
 *
 * \param payload       message payload (coordinates followed by data)
 * \param surface       surface descriptor passed through to
 *                      brw_send_indirect_surface_message()
 * \param msg_length    payload length in registers
 * \param num_channels  number of 32-bit channels to write per slot
 */
void
brw_typed_surface_write(struct brw_codegen *p,
                        struct brw_reg payload,
                        struct brw_reg surface,
                        unsigned msg_length,
                        unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Typed messages use DC port 1 on HSW+ and the render cache on IVB. */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN6_SFID_DATAPORT_RENDER_CACHE);
   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
                          WRITEMASK_X : WRITEMASK_XYZW);
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(brw_null_reg(), mask),
      payload, surface, msg_length, 0, true);

   brw_set_dp_typed_surface_write_message(
      p, insn, num_channels);
}
3144
/**
 * Fill in the descriptor of a memory fence message for the given dataport
 * SFID (render cache or data cache).
 *
 * \param sfid           target shared function; only the render-cache and
 *                       data-cache dataports are valid
 * \param commit_enable  request a commit write-back (response length 1 and
 *                       the commit-enable bit in msg_control)
 */
static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, insn, sfid,
                              1 /* message length */,
                              (commit_enable ? 1 : 0) /* response length */,
                              true /* header present */,
                              false);

   switch (sfid) {
   case GEN6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GEN7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
}
3173
/**
 * Emit a memory fence to the data cache (and, on IVB, the render cache too,
 * followed by a stall that orders the two flushes).
 *
 * \param dst  scratch register used for dependency tracking and, on IVB,
 *             for the commit write-backs; vec1 of dst (and dst+1 on IVB)
 *             is clobbered
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Only IVB needs the commit write-back (to implement the stall below). */
   const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
   struct brw_inst *insn;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = vec1(dst);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   dst = retype(dst, BRW_REGISTER_TYPE_UW);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too. Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   brw_pop_insn_state(p);
}
3218
/**
 * Emit a pixel interpolator (PI) query SEND.
 *
 * \param dest             destination for the interpolated barycentrics
 * \param mrf              payload register
 * \param noperspective    use non-perspective (linear) interpolation
 * \param mode             PI message type (sample/centroid/offset variants)
 * \param data             message-specific operand (e.g. sample index or
 *                         offset); may be an immediate or a register
 * \param msg_length       payload length in registers
 * \param response_length  response length in registers
 */
void
brw_pixel_interpolator_query(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             bool noperspective,
                             unsigned mode,
                             struct brw_reg data,
                             unsigned msg_length,
                             unsigned response_length)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;
   const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);

   /* brw_send_indirect_message will automatically use a direct send message
    * if data is actually immediate.
    */
   insn = brw_send_indirect_message(p,
                                    GEN7_SFID_PIXEL_INTERPOLATOR,
                                    dest,
                                    mrf,
                                    vec1(data));
   brw_inst_set_mlen(devinfo, insn, msg_length);
   brw_inst_set_rlen(devinfo, insn, response_length);

   brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
   brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
   brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
   brw_inst_set_pi_message_type(devinfo, insn, mode);
}
3249
/**
 * Write the index of the lowest live execution channel into (a vec1 of)
 * \p dst.
 *
 * \param mask  dispatch (or vector) mask to combine with ce0 where needed,
 *              because ce0 alone ignores channels that were never
 *              dispatched; must be of UD type.  Pass an immediate
 *              0xffffffff to skip the extra masking.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
   const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask. The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n). Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_reg(1, 0);

         /* Clear f1.0 so the per-group MOVs below can accumulate into it. */
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0. We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_flag_reg_nr(devinfo, inst, 1);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register. Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3359
/**
 * Copy the scalar value found in channel \p idx of \p src into \p dst
 * (dynamic-index component broadcast).
 *
 * \param dst  destination; written with execution size 1 (Align1) or 4
 *             (Align16), so it receives a uniform value
 * \param src  source register; must be a direct GRF with no modifiers and
 *             the same type as dst
 * \param idx  channel index; either an immediate or a register, in which
 *             case indirect addressing (Align1) or a predicated SEL
 *             (Align16) is used
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                       retype(brw_vec1_indirect(addr.subnr, offset),
                              BRW_REGISTER_TYPE_D));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                       retype(brw_vec1_indirect(addr.subnr, offset + 4),
                              BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3479
/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
 * one u32. So we use the same untyped atomic write message as the pixel
 * shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 *
 * \param payload     register pair holding the shader-time offset and delta
 *                    (msg_length of 2 below)
 * \param surf_index  binding table index of the shader-time buffer
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(p->devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_src1(p, send, brw_imm_ud(0));
   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);

   brw_pop_insn_state(p);
}
3525
3526
/**
 * Emit the SEND message for a barrier.
 *
 * Notifies the message gateway that this thread has reached the barrier;
 * the matching wait is emitted separately by brw_WAIT().
 *
 * \param src  payload register carrying the barrier message header
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());

   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
                              1 /* msg_length */,
                              0 /* response_length */,
                              false /* header_present */,
                              false /* end_of_thread */);

   brw_inst_set_gateway_notify(devinfo, inst, 1);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   /* Only one channel needs to signal the gateway. */
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}
3558
3559
/**
 * Emit the wait instruction for a barrier.
 *
 * WAIT blocks until the notification register (n0) is signalled, which
 * happens when all threads have sent their barrier message (see
 * brw_barrier()).
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   /* WAIT is a scalar operation on n0; run it unmasked with exec size 1. */
   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}