intel/eu: Fix pixel interpolator queries for SIMD32.
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 *
 * On return, *src is rewritten to the message register so the caller
 * encodes the SEND against the moved copy.
 */
void
gen6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Pre-Gen6 hardware performs the implied move itself; nothing to do. */
   if (devinfo->gen < 6)
      return;

   /* Source is already a message register; no copy needed. */
   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   /* The null register carries no payload, so skip the copy but still
    * redirect *src to the MRF below.
    */
   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      /* Emit an unmasked, uncompressed SIMD8 raw-bits (UD) copy so the
       * move is unaffected by the caller's default execution state.
       */
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
68
69 static void
70 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
71 {
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
76 *
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
79 */
80 const struct gen_device_info *devinfo = p->devinfo;
81 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
82 reg->file = BRW_GENERAL_REGISTER_FILE;
83 reg->nr += GEN7_MRF_HACK_START;
84 }
85 }
86
/**
 * Encode \p dest as the destination operand of \p inst.
 *
 * Handles direct and register-indirect addressing in both Align1 and
 * Align16 access modes, and may shrink the instruction's execution size
 * to match a narrow destination (see p->automatic_exec_sizes).
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Validate the register number against the file's limits. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* Gen7+ fakes MRFs on top of GRFs; remap before encoding. */
   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         /* Destinations cannot use stride 0; promote it to 1. */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         /* Align16 addresses subregisters in 16-byte units. */
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      /* Register-indirect addressing. */
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
         /* even ignored in da16, still need to set as '01' */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
165
/**
 * Encode \p reg as the first source operand of \p inst.
 *
 * Handles immediates, direct and register-indirect addressing, and both
 * Align1 region descriptions and Align16 swizzles.  For SEND/SENDC on
 * Gen6+, the source merely names the register the message payload starts
 * at, so modifiers and indirect addressing are rejected.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Validate the register number against the file's limits. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* Gen7+ fakes MRFs on top of GRFs; remap before encoding. */
   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 64-bit immediates (DF/Q/UQ, and DIM's payload) occupy the whole
       * immediate field; smaller ones use only the low 32 bits.
       */
      if (reg.type == BRW_REGISTER_TYPE_DF ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
         brw_inst_set_imm_df(devinfo, inst, reg.df);
      else if (reg.type == BRW_REGISTER_TYPE_UQ ||
               reg.type == BRW_REGISTER_TYPE_Q)
         brw_inst_set_imm_uq(devinfo, inst, reg.u64);
      else
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);

      /* For <= 32-bit immediates, mirror the type into src1's slot.
       * NOTE(review): presumably required by the instruction encoding for
       * immediate operands -- confirm against the PRM.
       */
      if (type_sz(reg.type) < 8) {
         brw_inst_set_src1_reg_file(devinfo, inst,
                                    BRW_ARCHITECTURE_REGISTER_FILE);
         brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                       brw_inst_src0_reg_hw_type(devinfo, inst));
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            /* Align16 addresses subregisters in 16-byte units. */
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
         } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A <0;1,0> scalar region for exec size 1; otherwise pass the
          * caller's region description through unchanged.
          */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
            /* This is an oddity of the fact we're using the same
             * descriptions for registers in align_16 as align_1:
             */
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    reg.type == BRW_REGISTER_TYPE_DF &&
                    reg.vstride == BRW_VERTICAL_STRIDE_2) {
            /* From SNB PRM:
             *
             * "For Align16 access mode, only encodings of 0000 and 0011
             *  are allowed. Other codes are reserved."
             *
             * Presumably the DevSNB behavior applies to IVB as well.
             */
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else {
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      }
   }
}
271
272
/**
 * Encode \p reg as the second source operand of \p inst.
 *
 * Unlike src0, src1 supports only direct addressing, may not be an MRF or
 * the accumulator, and can hold at most a 32-bit immediate.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *     operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   /* Gen7+ fakes MRFs on top of GRFs; after the remap an MRF must not
    * remain, since src1 cannot encode one.
    */
   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* two-argument instructions can only use 32-bit immediates */
      assert(type_sz(reg.type) < 8);
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         /* Align16 addresses subregisters in 16-byte units. */
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A <0;1,0> scalar region for exec size 1; otherwise pass the
          * caller's region description through unchanged.
          */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
            /* This is an oddity of the fact we're using the same
             * descriptions for registers in align_16 as align_1:
             */
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    reg.type == BRW_REGISTER_TYPE_DF &&
                    reg.vstride == BRW_VERTICAL_STRIDE_2) {
            /* From SNB PRM:
             *
             * "For Align16 access mode, only encodings of 0000 and 0011
             *  are allowed. Other codes are reserved."
             *
             * Presumably the DevSNB behavior applies to IVB as well.
             */
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else {
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      }
   }
}
361
/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
void
brw_set_message_descriptor(struct brw_codegen *p,
                           brw_inst *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Zero the descriptor (src1 holds it as an immediate). */
   brw_set_src1(p, inst, brw_imm_d(0));

   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
    * itself; instead, it will be a MOV/OR into the address register.
    *
    * In this case, we avoid setting the extended message descriptor bits,
    * since they go on the later SEND/SENDC instead and if set here would
    * instead clobber the conditionalmod bits.
    */
   unsigned opcode = brw_inst_opcode(devinfo, inst);
   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
      brw_inst_set_sfid(devinfo, inst, sfid);
   }

   brw_inst_set_mlen(devinfo, inst, msg_length);
   brw_inst_set_rlen(devinfo, inst, response_length);
   brw_inst_set_eot(devinfo, inst, end_of_thread);

   /* The header-present bit only exists on Gen5+. */
   if (devinfo->gen >= 5) {
      brw_inst_set_header_present(devinfo, inst, header_present);
   }
}
403
404 static void brw_set_math_message( struct brw_codegen *p,
405 brw_inst *inst,
406 unsigned function,
407 unsigned integer_type,
408 bool low_precision,
409 unsigned dataType )
410 {
411 const struct gen_device_info *devinfo = p->devinfo;
412 unsigned msg_length;
413 unsigned response_length;
414
415 /* Infer message length from the function */
416 switch (function) {
417 case BRW_MATH_FUNCTION_POW:
418 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
419 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
420 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
421 msg_length = 2;
422 break;
423 default:
424 msg_length = 1;
425 break;
426 }
427
428 /* Infer response length from the function */
429 switch (function) {
430 case BRW_MATH_FUNCTION_SINCOS:
431 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
432 response_length = 2;
433 break;
434 default:
435 response_length = 1;
436 break;
437 }
438
439
440 brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
441 msg_length, response_length, false, false);
442 brw_inst_set_math_msg_function(devinfo, inst, function);
443 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
444 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
445 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
446 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
447 brw_inst_set_saturate(devinfo, inst, 0);
448 }
449
450
/* Fill out the descriptor for a URB FF_SYNC message.
 * FF_SYNC always has a single payload register with a header present.
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
469
/* Fill out the descriptor for a URB write message.
 *
 * Encodes the write opcode (HWORD or OWORD), offset, swizzle and the
 * gen-specific flag bits; which flags exist depends on the hardware
 * generation, hence the asserts up front.
 */
static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Transpose and allocate disappear on Gen7+; per-slot offsets appear. */
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
510
/* Fill out the descriptor for a data port write message.
 *
 * On Gen6+ the caller-supplied \p target_cache is the SFID; earlier
 * hardware uses the fixed dataport-write SFID.
 */
void
brw_set_dp_write_message(struct brw_codegen *p,
                         brw_inst *insn,
                         unsigned binding_table_index,
                         unsigned msg_control,
                         unsigned msg_type,
                         unsigned target_cache,
                         unsigned msg_length,
                         bool header_present,
                         unsigned last_render_target,
                         unsigned response_length,
                         unsigned end_of_thread,
                         unsigned send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
                          BRW_SFID_DATAPORT_WRITE);

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
   brw_inst_set_rt_last(devinfo, insn, last_render_target);
   /* The send-commit bit only exists before Gen7. */
   if (devinfo->gen < 7) {
      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
   }

   /* Gen11+ adds a "null render target" descriptor bit; default it off. */
   if (devinfo->gen >= 11)
      brw_inst_set_null_rt(devinfo, insn, false);
}
543
/* Fill out the descriptor for a data port read message.
 *
 * On Gen6+ the caller-supplied \p target_cache is the SFID; before that
 * the fixed dataport-read SFID is used and the target cache goes in a
 * dedicated descriptor field instead.
 */
void
brw_set_dp_read_message(struct brw_codegen *p,
                        brw_inst *insn,
                        unsigned binding_table_index,
                        unsigned msg_control,
                        unsigned msg_type,
                        unsigned target_cache,
                        unsigned msg_length,
                        bool header_present,
                        unsigned response_length)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
                          BRW_SFID_DATAPORT_READ);

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
   if (devinfo->gen < 6)
      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
}
568
/* Fill out the descriptor for a sampler message.
 *
 * The SIMD-mode field exists on Gen5+; original Gen4 (but not G4x)
 * instead encodes the return format.
 */
void
brw_set_sampler_message(struct brw_codegen *p,
                        brw_inst *inst,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
   brw_inst_set_sampler(devinfo, inst, sampler);
   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
   if (devinfo->gen >= 5) {
      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
   }
}
595
/* Fill out the descriptor for a Gen7+ scratch block read/write message.
 *
 * \p num_regs is the number of registers to transfer; its encoding
 * differs by generation: Gen8+ stores log2(num_regs) (allowing 8),
 * Gen7 stores num_regs - 1 (max 4).
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
623
/* Apply the generator's default instruction state (\p state) to a freshly
 * allocated instruction: execution size/group, compression, access mode,
 * masking, saturate, predication, and flag-register selection.
 */
static void
brw_inst_set_state(const struct gen_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* flag_subreg packs reg and subreg: reg = flag_subreg / 2,
    * subreg = flag_subreg % 2.  3-src Align16 instructions use dedicated
    * fields for these.
    */
   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
652
#define next_insn brw_next_insn
/**
 * Append a new instruction to the program store and return a pointer to it.
 *
 * Doubles the store when it is full, zeroes the new instruction, sets its
 * opcode, and stamps it with the generator's current default state.
 */
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* Grow the instruction store geometrically when out of room. */
   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   /* Native instructions are 16 bytes each. */
   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];

   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(devinfo, insn, opcode);

   /* Apply the default instruction state */
   brw_inst_set_state(devinfo, insn, p->current);

   return insn;
}
676
677 static brw_inst *
678 brw_alu1(struct brw_codegen *p, unsigned opcode,
679 struct brw_reg dest, struct brw_reg src)
680 {
681 brw_inst *insn = next_insn(p, opcode);
682 brw_set_dest(p, insn, dest);
683 brw_set_src0(p, insn, src);
684 return insn;
685 }
686
687 static brw_inst *
688 brw_alu2(struct brw_codegen *p, unsigned opcode,
689 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
690 {
691 /* 64-bit immediates are only supported on 1-src instructions */
692 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
693 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
694
695 brw_inst *insn = next_insn(p, opcode);
696 brw_set_dest(p, insn, dest);
697 brw_set_src0(p, insn, src0);
698 brw_set_src1(p, insn, src1);
699 return insn;
700 }
701
702 static int
703 get_3src_subreg_nr(struct brw_reg reg)
704 {
705 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
706 * use 32-bit units (components 0..7). Since they only support F/D/UD
707 * types, this doesn't lose any flexibility, but uses fewer bits.
708 */
709 return reg.subnr / 4;
710 }
711
712 static enum gen10_align1_3src_vertical_stride
713 to_3src_align1_vstride(enum brw_vertical_stride vstride)
714 {
715 switch (vstride) {
716 case BRW_VERTICAL_STRIDE_0:
717 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
718 case BRW_VERTICAL_STRIDE_2:
719 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
720 case BRW_VERTICAL_STRIDE_4:
721 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
722 case BRW_VERTICAL_STRIDE_8:
723 case BRW_VERTICAL_STRIDE_16:
724 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
725 default:
726 unreachable("invalid vstride");
727 }
728 }
729
730
731 static enum gen10_align1_3src_src_horizontal_stride
732 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
733 {
734 switch (hstride) {
735 case BRW_HORIZONTAL_STRIDE_0:
736 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
737 case BRW_HORIZONTAL_STRIDE_1:
738 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
739 case BRW_HORIZONTAL_STRIDE_2:
740 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
741 case BRW_HORIZONTAL_STRIDE_4:
742 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
743 default:
744 unreachable("invalid hstride");
745 }
746 }
747
/* Emit a three-source ALU instruction (MAD, LRP, BFE, ...).
 *
 * 3-src instructions use a compressed encoding: direct addressing only,
 * restricted register files, and two very different layouts depending on
 * the current access mode (align1 on newer hardware vs. align16).
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   /* 3-src encodings only support direct addressing of regs 0..127. */
   assert(dest.nr < 128);
   assert(src0.nr < 128);
   assert(src1.nr < 128);
   assert(src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      /* --- Align1 3-src layout --- */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      /* An ARF destination can only be the accumulator here. */
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                           BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                           BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      }
      /* Destination subregister is encoded in 8-byte units. */
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      /* Execution pipe is chosen from the destination type. */
      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      brw_inst_set_3src_a1_src0_vstride(devinfo, inst,
                                        to_3src_align1_vstride(src0.vstride));
      brw_inst_set_3src_a1_src1_vstride(devinfo, inst,
                                        to_3src_align1_vstride(src1.vstride));
      /* no vstride on src2 */

      brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src0.hstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));
      brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src2.hstride));

      brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
      /* An NF-typed src0 reads the accumulator. */
      if (src0.type == BRW_REGISTER_TYPE_NF) {
         brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      }
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      /* An ARF src1 can only be the accumulator. */
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);

      /* Per-source register-file restrictions of the align1 encoding. */
      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                         src0.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                         src1.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_ACCUMULATOR);
      brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                         src2.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
   } else {
      /* --- Align16 3-src layout --- */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D ||
             dest.type == BRW_REGISTER_TYPE_UD);
      /* Only Gen6 has a destination-file bit here (GRF vs MRF). */
      if (devinfo->gen == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      /* rep_ctrl (vstride == 0) broadcasts a single component. */
      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->gen >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types. The MAD and LRP emitters ensure
          * that all four types are float. The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
      }
   }

   return inst;
}
909
910
911 /***********************************************************************
912 * Convenience routines.
913 */
/* Define brw_<OP>(), a convenience emitter for a single-source ALU
 * instruction with opcode BRW_OPCODE_<OP>.
 */
#define ALU1(OP)                                                \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
              struct brw_reg dest,                              \
              struct brw_reg src0)                              \
{                                                               \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);             \
}
921
/* Define brw_<OP>(), a convenience emitter for a two-source ALU
 * instruction with opcode BRW_OPCODE_<OP>.
 */
#define ALU2(OP)                                                \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
              struct brw_reg dest,                              \
              struct brw_reg src0,                              \
              struct brw_reg src1)                              \
{                                                               \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);       \
}
930
/* Define brw_<OP>(), a convenience emitter for a three-source ALU
 * instruction with opcode BRW_OPCODE_<OP>.
 */
#define ALU3(OP)                                                \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
              struct brw_reg dest,                              \
              struct brw_reg src0,                              \
              struct brw_reg src1,                              \
              struct brw_reg src2)                              \
{                                                               \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
940
/* Define brw_<OP>() for a three-source op restricted to float (F) or
 * double-float (DF) operands.  All three source types must match the
 * destination type, which these asserts enforce in debug builds.
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
              struct brw_reg dest,                              \
              struct brw_reg src0,                              \
              struct brw_reg src1,                              \
              struct brw_reg src2)                              \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
961
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD, so on
 * gen6+ only the single rounding instruction is emitted and 'add' is left
 * unused.
 */
#define ROUND(OP)                                                       \
void brw_##OP(struct brw_codegen *p,                                    \
              struct brw_reg dest,                                      \
              struct brw_reg src)                                       \
{                                                                       \
   const struct gen_device_info *devinfo = p->devinfo;                  \
   brw_inst *rnd, *add;                                                 \
   rnd = next_insn(p, BRW_OPCODE_##OP);                                 \
   brw_set_dest(p, rnd, dest);                                          \
   brw_set_src0(p, rnd, src);                                           \
                                                                        \
   if (devinfo->gen < 6) {                                              \
      /* turn on round-increments */                                    \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                    \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);    \
   }                                                                    \
}
987
988
/* Instantiate the brw_<OP>() emitters for all simple ALU instructions
 * using the ALU1/ALU2/ALU3/ALU3F templates defined above.
 */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

/* RNDZ/RNDE need the pre-gen6 two-instruction rounding fixup (see ROUND). */
ROUND(RNDZ)
ROUND(RNDE)
1022
1023 brw_inst *
1024 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1025 {
1026 const struct gen_device_info *devinfo = p->devinfo;
1027
1028 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1029 * To avoid the problems that causes, we use a <1,2,0> source region to read
1030 * each element twice.
1031 */
1032 if (devinfo->gen == 7 && !devinfo->is_haswell &&
1033 brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1034 dest.type == BRW_REGISTER_TYPE_DF &&
1035 (src0.type == BRW_REGISTER_TYPE_F ||
1036 src0.type == BRW_REGISTER_TYPE_D ||
1037 src0.type == BRW_REGISTER_TYPE_UD) &&
1038 !has_scalar_region(src0)) {
1039 assert(src0.vstride == BRW_VERTICAL_STRIDE_4 &&
1040 src0.width == BRW_WIDTH_4 &&
1041 src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1042
1043 src0.vstride = BRW_VERTICAL_STRIDE_1;
1044 src0.width = BRW_WIDTH_2;
1045 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1046 }
1047
1048 return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1049 }
1050
1051 brw_inst *
1052 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1053 struct brw_reg src0, struct brw_reg src1)
1054 {
1055 /* 6.2.2: add */
1056 if (src0.type == BRW_REGISTER_TYPE_F ||
1057 (src0.file == BRW_IMMEDIATE_VALUE &&
1058 src0.type == BRW_REGISTER_TYPE_VF)) {
1059 assert(src1.type != BRW_REGISTER_TYPE_UD);
1060 assert(src1.type != BRW_REGISTER_TYPE_D);
1061 }
1062
1063 if (src1.type == BRW_REGISTER_TYPE_F ||
1064 (src1.file == BRW_IMMEDIATE_VALUE &&
1065 src1.type == BRW_REGISTER_TYPE_VF)) {
1066 assert(src0.type != BRW_REGISTER_TYPE_UD);
1067 assert(src0.type != BRW_REGISTER_TYPE_D);
1068 }
1069
1070 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1071 }
1072
1073 brw_inst *
1074 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1075 struct brw_reg src0, struct brw_reg src1)
1076 {
1077 assert(dest.type == src0.type);
1078 assert(src0.type == src1.type);
1079 switch (src0.type) {
1080 case BRW_REGISTER_TYPE_B:
1081 case BRW_REGISTER_TYPE_UB:
1082 case BRW_REGISTER_TYPE_W:
1083 case BRW_REGISTER_TYPE_UW:
1084 case BRW_REGISTER_TYPE_D:
1085 case BRW_REGISTER_TYPE_UD:
1086 break;
1087 default:
1088 unreachable("Bad type for brw_AVG");
1089 }
1090
1091 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1092 }
1093
1094 brw_inst *
1095 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1096 struct brw_reg src0, struct brw_reg src1)
1097 {
1098 /* 6.32.38: mul */
1099 if (src0.type == BRW_REGISTER_TYPE_D ||
1100 src0.type == BRW_REGISTER_TYPE_UD ||
1101 src1.type == BRW_REGISTER_TYPE_D ||
1102 src1.type == BRW_REGISTER_TYPE_UD) {
1103 assert(dest.type != BRW_REGISTER_TYPE_F);
1104 }
1105
1106 if (src0.type == BRW_REGISTER_TYPE_F ||
1107 (src0.file == BRW_IMMEDIATE_VALUE &&
1108 src0.type == BRW_REGISTER_TYPE_VF)) {
1109 assert(src1.type != BRW_REGISTER_TYPE_UD);
1110 assert(src1.type != BRW_REGISTER_TYPE_D);
1111 }
1112
1113 if (src1.type == BRW_REGISTER_TYPE_F ||
1114 (src1.file == BRW_IMMEDIATE_VALUE &&
1115 src1.type == BRW_REGISTER_TYPE_VF)) {
1116 assert(src0.type != BRW_REGISTER_TYPE_UD);
1117 assert(src0.type != BRW_REGISTER_TYPE_D);
1118 }
1119
1120 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1121 src0.nr != BRW_ARF_ACCUMULATOR);
1122 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1123 src1.nr != BRW_ARF_ACCUMULATOR);
1124
1125 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1126 }
1127
1128 brw_inst *
1129 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1130 struct brw_reg src0, struct brw_reg src1)
1131 {
1132 src0.vstride = BRW_VERTICAL_STRIDE_0;
1133 src0.width = BRW_WIDTH_1;
1134 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1135 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1136 }
1137
1138 brw_inst *
1139 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1140 struct brw_reg src0, struct brw_reg src1)
1141 {
1142 src0.vstride = BRW_VERTICAL_STRIDE_0;
1143 src0.width = BRW_WIDTH_1;
1144 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1145 src1.vstride = BRW_VERTICAL_STRIDE_8;
1146 src1.width = BRW_WIDTH_8;
1147 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1148 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1149 }
1150
/* Emit a float-to-half conversion into dst.  On gen8+ this is a converting
 * MOV to an HF-typed view of dst; on gen7 it is the dedicated F32TO16
 * instruction.  When the destination is UD, the high 16 bits of each dword
 * may additionally need to be zeroed with a second MOV.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Retarget the conversion at the low word of each destination dword
       * (stride 2), switching to Align1 so the W-typed region is legal.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* Zero the high word of each dword with a second MOV.  The
       * no_dd_clear/no_dd_check pair links the two writes for dependency
       * tracking purposes.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1197
1198 brw_inst *
1199 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1200 {
1201 const struct gen_device_info *devinfo = p->devinfo;
1202 bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1203
1204 if (align16) {
1205 assert(src.type == BRW_REGISTER_TYPE_UD);
1206 } else {
1207 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1208 *
1209 * Because this instruction does not have a 16-bit floating-point
1210 * type, the source data type must be Word (W). The destination type
1211 * must be F (Float).
1212 */
1213 if (src.type == BRW_REGISTER_TYPE_UD)
1214 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1215
1216 assert(src.type == BRW_REGISTER_TYPE_W ||
1217 src.type == BRW_REGISTER_TYPE_UW ||
1218 src.type == BRW_REGISTER_TYPE_HF);
1219 }
1220
1221 if (devinfo->gen >= 8) {
1222 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1223 } else {
1224 assert(devinfo->gen == 7);
1225 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1226 }
1227 }
1228
1229
1230 void brw_NOP(struct brw_codegen *p)
1231 {
1232 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1233 memset(insn, 0, sizeof(*insn));
1234 brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1235 }
1236
1237
1238
1239
1240
1241 /***********************************************************************
1242 * Comparisons, if/else/endif
1243 */
1244
1245 brw_inst *
1246 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1247 unsigned predicate_control)
1248 {
1249 const struct gen_device_info *devinfo = p->devinfo;
1250 struct brw_reg ip = brw_ip_reg();
1251 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1252
1253 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1254 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1255 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1256 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1257
1258 return inst;
1259 }
1260
1261 static void
1262 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1263 {
1264 p->if_stack[p->if_stack_depth] = inst - p->store;
1265
1266 p->if_stack_depth++;
1267 if (p->if_stack_array_size <= p->if_stack_depth) {
1268 p->if_stack_array_size *= 2;
1269 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1270 p->if_stack_array_size);
1271 }
1272 }
1273
1274 static brw_inst *
1275 pop_if_stack(struct brw_codegen *p)
1276 {
1277 p->if_stack_depth--;
1278 return &p->store[p->if_stack[p->if_stack_depth]];
1279 }
1280
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   /* Grow both parallel arrays if needed.  The +1 accounts for the
    * if_depth_in_loop entry written below at the post-increment depth.
    */
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   /* Store an index rather than a pointer, since p->store may be
    * reallocated as more instructions are emitted.
    */
   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   /* The newly entered loop starts with no open IF blocks inside it. */
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}
1296
1297 static brw_inst *
1298 get_inner_do_insn(struct brw_codegen *p)
1299 {
1300 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1301 }
1302
/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction.  All jump fields (gen4
    * jump count, gen6 jump count, gen7+ JIP/UIP) are left zero here and
    * patched later by patch_IF_ELSE once the matching ELSE/ENDIF is known.
    */
   if (devinfo->gen < 6) {
      /* Pre-gen6: IF reads and writes the instruction pointer. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: the jump count is encoded in an immediate word destination. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: null dest/src0, immediate word src1, separate JIP/UIP. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: a single immediate dword src0, plus the JIP/UIP fields. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires an explicit thread switch. */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Remember this IF so brw_ENDIF can find and patch it later. */
   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1359
1360 /* This function is only used for gen6-style IF instructions with an
1361 * embedded comparison (conditional modifier). It is not used on gen7.
1362 */
1363 brw_inst *
1364 gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
1365 struct brw_reg src0, struct brw_reg src1)
1366 {
1367 const struct gen_device_info *devinfo = p->devinfo;
1368 brw_inst *insn;
1369
1370 insn = next_insn(p, BRW_OPCODE_IF);
1371
1372 brw_set_dest(p, insn, brw_imm_w(0));
1373 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1374 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1375 brw_set_src0(p, insn, src0);
1376 brw_set_src1(p, insn, src1);
1377
1378 assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
1379 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
1380 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1381
1382 push_if_stack(p, insn);
1383 return insn;
1384 }
1385
/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* Distances are (instruction count) * 16 — IP is byte-addressed and
       * this path only runs pre-gen6, where each instruction is 16 bytes.
       */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1426
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * Called from brw_ENDIF once the location of the ENDIF is known; the jump
 * fields were emitted as zero by brw_IF/gen6_IF/brw_ELSE.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* Per-generation scale converting instruction distances to jump units. */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1517
/* Emit an ELSE for the innermost open IF.  Jump fields are left zero and
 * patched by patch_IF_ELSE when the matching ENDIF is emitted.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* Per-generation operand encodings; these mirror brw_IF. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires an explicit thread switch. */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Pushed above the IF so brw_ENDIF can pop and patch both. */
   push_if_stack(p, insn);
}
1555
/* Close the innermost open IF/ELSE block: emit the ENDIF (where needed),
 * pop the corresponding entries off the if stack, and patch the jump
 * targets of the IF and optional ELSE.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encodings; these mirror brw_IF/brw_ELSE. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires an explicit thread switch. */
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   /* Now that the ENDIF's location is known, fill in the IF/ELSE jumps. */
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1636
1637 brw_inst *
1638 brw_BREAK(struct brw_codegen *p)
1639 {
1640 const struct gen_device_info *devinfo = p->devinfo;
1641 brw_inst *insn;
1642
1643 insn = next_insn(p, BRW_OPCODE_BREAK);
1644 if (devinfo->gen >= 8) {
1645 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1646 brw_set_src0(p, insn, brw_imm_d(0x0));
1647 } else if (devinfo->gen >= 6) {
1648 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1649 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1650 brw_set_src1(p, insn, brw_imm_d(0x0));
1651 } else {
1652 brw_set_dest(p, insn, brw_ip_reg());
1653 brw_set_src0(p, insn, brw_ip_reg());
1654 brw_set_src1(p, insn, brw_imm_d(0x0));
1655 brw_inst_set_gen4_pop_count(devinfo, insn,
1656 p->if_depth_in_loop[p->loop_stack_depth]);
1657 }
1658 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1659 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1660
1661 return insn;
1662 }
1663
1664 brw_inst *
1665 brw_CONT(struct brw_codegen *p)
1666 {
1667 const struct gen_device_info *devinfo = p->devinfo;
1668 brw_inst *insn;
1669
1670 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1671 brw_set_dest(p, insn, brw_ip_reg());
1672 if (devinfo->gen >= 8) {
1673 brw_set_src0(p, insn, brw_imm_d(0x0));
1674 } else {
1675 brw_set_src0(p, insn, brw_ip_reg());
1676 brw_set_src1(p, insn, brw_imm_d(0x0));
1677 }
1678
1679 if (devinfo->gen < 6) {
1680 brw_inst_set_gen4_pop_count(devinfo, insn,
1681 p->if_depth_in_loop[p->loop_stack_depth]);
1682 }
1683 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1684 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1685 return insn;
1686 }
1687
1688 brw_inst *
1689 gen6_HALT(struct brw_codegen *p)
1690 {
1691 const struct gen_device_info *devinfo = p->devinfo;
1692 brw_inst *insn;
1693
1694 insn = next_insn(p, BRW_OPCODE_HALT);
1695 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1696 if (devinfo->gen >= 8) {
1697 brw_set_src0(p, insn, brw_imm_d(0x0));
1698 } else {
1699 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1700 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1701 }
1702
1703 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1704 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1705 return insn;
1706 }
1707
/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; just record the loop start so
       * brw_WHILE can jump back to it.  Note the returned pointer is only
       * valid until p->store is reallocated by a later next_insn().
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1750
/**
 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   /* Walk backwards from the WHILE to the matching DO, patching every
    * unpatched BREAK/CONTINUE in between.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps just past the WHILE (out of the loop). */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONTINUE jumps to the WHILE itself (next iteration). */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
1782
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* The operand layout of WHILE changed across generations; in all
       * cases the backward branch distance is (do_insn - insn) scaled by
       * the per-generation jump unit.
       */
      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         /* Gen6 keeps the distance in its own jump-count field. */
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* No divergence possible: the loop back-edge is just an IP add. */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         /* Inherit the DO's execution size for the loop back-edge. */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Now that the WHILE location is known, fix up any BREAK/CONT
          * inside the loop body (pre-gen6 only).
          */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1844
1845 /* FORWARD JUMPS:
1846 */
1847 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1848 {
1849 const struct gen_device_info *devinfo = p->devinfo;
1850 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1851 unsigned jmpi = 1;
1852
1853 if (devinfo->gen >= 5)
1854 jmpi = 2;
1855
1856 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1857 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1858
1859 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1860 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1861 }
1862
1863 /* To integrate with the above, it makes sense that the comparison
1864 * instruction should populate the flag register. It might be simpler
1865 * just to use the flag reg for most WM tasks?
1866 */
1867 void brw_CMP(struct brw_codegen *p,
1868 struct brw_reg dest,
1869 unsigned conditional,
1870 struct brw_reg src0,
1871 struct brw_reg src1)
1872 {
1873 const struct gen_device_info *devinfo = p->devinfo;
1874 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1875
1876 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1877 brw_set_dest(p, insn, dest);
1878 brw_set_src0(p, insn, src0);
1879 brw_set_src1(p, insn, src1);
1880
1881 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1882 * page says:
1883 * "Any CMP instruction with a null destination must use a {switch}."
1884 *
1885 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1886 * mentioned on their work-arounds pages.
1887 */
1888 if (devinfo->gen == 7) {
1889 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1890 dest.nr == BRW_ARF_NULL) {
1891 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1892 }
1893 }
1894 }
1895
1896 /***********************************************************************
1897 * Helpers for the various SEND message types:
1898 */
1899
1900 /** Extended math function, float[8].
1901 */
1902 void gen4_math(struct brw_codegen *p,
1903 struct brw_reg dest,
1904 unsigned function,
1905 unsigned msg_reg_nr,
1906 struct brw_reg src,
1907 unsigned precision )
1908 {
1909 const struct gen_device_info *devinfo = p->devinfo;
1910 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1911 unsigned data_type;
1912 if (has_scalar_region(src)) {
1913 data_type = BRW_MATH_DATA_SCALAR;
1914 } else {
1915 data_type = BRW_MATH_DATA_VECTOR;
1916 }
1917
1918 assert(devinfo->gen < 6);
1919
1920 /* Example code doesn't set predicate_control for send
1921 * instructions.
1922 */
1923 brw_inst_set_pred_control(devinfo, insn, 0);
1924 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1925
1926 brw_set_dest(p, insn, dest);
1927 brw_set_src0(p, insn, src);
1928 brw_set_math_message(p,
1929 insn,
1930 function,
1931 src.type == BRW_REGISTER_TYPE_D,
1932 precision,
1933 data_type);
1934 }
1935
/* Emit a gen6+ MATH instruction, validating the per-generation register
 * and type restrictions of the extended math unit.
 */
void gen6_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   /* Gen6 additionally requires packed (stride-1) sources. */
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* Integer divide variants take integer operands; everything else on the
    * math unit operates on floats.
    */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1982
1983 /**
1984 * Return the right surface index to access the thread scratch space using
1985 * stateless dataport messages.
1986 */
1987 unsigned
1988 brw_scratch_surface_idx(const struct brw_codegen *p)
1989 {
1990 /* The scratch space is thread-local so IA coherency is unnecessary. */
1991 if (p->devinfo->gen >= 8)
1992 return GEN8_BTI_STATELESS_NON_COHERENT;
1993 else
1994 return BRW_BTI_STATELESS;
1995 }
1996
1997 /**
1998 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1999 * using a constant offset per channel.
2000 *
2001 * The offset must be aligned to oword size (16 bytes). Used for
2002 * register spilling.
2003 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* The scratch write goes through a different data port depending on
    * generation: data cache (gen7+), render cache (gen6), or the gen4/5
    * render-cache read target encoding.
    */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   uint32_t msg_type;

   /* From gen6 on the message header offset is in owords (16 bytes). */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One header register plus the payload registers. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
         src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
                               insn,
                               brw_scratch_surface_idx(p),
                               BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                               msg_type,
                               target_cache,
                               mlen,
                               true, /* header_present */
                               0, /* not a render target */
                               send_commit_msg, /* response_length */
                               0, /* eot */
                               send_commit_msg);
   }
}
2107
2108
2109 /**
2110 * Read a block of owords (half a GRF each) from the scratch buffer
2111 * using a constant index per channel.
2112 *
2113 * Offset must be aligned to oword size (16 bytes). Used for register
2114 * spilling.
2115 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* From gen6 on the message header offset is in owords (16 bytes). */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);

   /* Build the message header: a copy of g0 with the scratch offset
    * written into element 2.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest); /* UW? */
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         /* Pre-gen6 the payload comes from the implied base MRF. */
         brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_dp_read_message(p,
                              insn,
                              brw_scratch_surface_idx(p),
                              BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
                              target_cache,
                              1, /* msg_length */
                              true, /* header_present */
                              rlen);
   }
}
2188
/* Read num_regs full registers from the scratch buffer at byte offset
 * `offset` using the gen7+ HWord scratch-block read message.
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
2222
2223 /**
2224 * Read float[4] vectors from the data port constant cache.
2225 * Location (in buffer) should be a multiple of 16.
2226 * Used for fetching shader constants.
2227 */
void brw_oword_block_read(struct brw_codegen *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Constants live in the constant cache on gen6+, the data cache before. */
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Build the message header: g0 with the oword offset in element 2. */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      /* Pre-gen6 the payload comes from the implied base MRF. */
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_dp_read_message(p, insn, bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           target_cache,
                           1, /* msg_length */
                           true, /* header_present */
                           DIV_ROUND_UP(exec_size, 8)); /* response_length */

   brw_pop_insn_state(p);
}
2287
/* Emit a framebuffer (render target) write message.  Returns the SEND so
 * the caller can adjust fields afterwards.
 */
brw_inst *
brw_fb_WRITE(struct brw_codegen *p,
             struct brw_reg payload,
             struct brw_reg implied_header,
             unsigned msg_control,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool eot,
             bool last_render_target,
             bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   /* The (null) destination width mirrors the dispatch width. */
   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   /* SENDC (send with dependency check) is used for RT writes on gen6+. */
   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
                            insn,
                            binding_table_index,
                            msg_control,
                            msg_type,
                            target_cache,
                            msg_length,
                            header_present,
                            last_render_target,
                            response_length,
                            eot,
                            0 /* send_commit_msg */);

   return insn;
}
2350
/* Emit a gen9+ render-target read message (used e.g. for framebuffer
 * fetch).  Returns the SENDC instruction.
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   /* Subtype 0 selects the SIMD16 variant, anything else the SIMD8 one
    * (presumably issued per-half for wider dispatches -- confirm against
    * the RC message descriptor docs).
    */
   const unsigned msg_subtype =
      brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_dp_read_message(p, insn, binding_table_index,
                           per_sample << 5 | msg_subtype,
                           GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                           GEN6_SFID_DATAPORT_RENDER_CACHE,
                           msg_length, true /* header_present */,
                           response_length);
   /* Select which 16-channel group this message services. */
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}
2378
2379 /**
2380 * Texture sample instruction.
2381 * Note: the msg_type plus msg_length values determine exactly what kind
2382 * of sampling operation is performed. See volume 4, page 161 of docs.
2383 */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr == -1 means the payload is already in place. */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
2435
2436 /* Adjust the message header's sampler state pointer to
2437 * select the correct group of 16 samplers.
2438 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      /* Only adjust when the index overflows the 4-bit field. */
      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         /* Offset the pointer (held in g0.3) by whole groups of 16. */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* (index & ~0xf) * 16 == the byte offset of the 16-sampler group. */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2481
2482 /* All these variables are pretty confusing - we might be better off
2483 * using bitmasks and macros for this, in the old style. Or perhaps
2484 * just having the caller instantiate the fields in dword3 itself.
2485 */
void brw_urb_WRITE(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      /* OR 0xff00 into m0.5, which holds the channel mask bits. */
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}
2533
/* Emit a SEND whose descriptor may be either an immediate or a register.
 * In the register case the descriptor is first OR'd into a0.0 so callers
 * can add further descriptor bits via the brw_set_*_message() helpers.
 * Returns the setup instruction (the SEND itself in the immediate case,
 * the OR in the indirect case).
 */
struct brw_inst *
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;
   int setup;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   /* We hold on to the setup instruction (the SEND in the direct case, the OR
    * in the indirect case) by its index in the instruction store.  The
    * pointer returned by next_insn() may become invalid if emitting the SEND
    * in the indirect case reallocs the store.
    */

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      setup = p->nr_insn;
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, desc);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the usual
       * brw_set_*_message() helper functions.
       */
      setup = p->nr_insn;
      brw_OR(p, addr, desc, brw_imm_ud(0));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   /* Narrow destinations narrow the execution size accordingly. */
   if (dst.width < BRW_EXECUTE_8)
      brw_inst_set_exec_size(devinfo, send, dst.width);

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);

   return &p->store[setup];
}
2591
/* Like brw_send_indirect_message(), but for messages addressed at a
 * (possibly register-valued) surface index.  Sets up mlen/rlen/header on
 * the SEND and returns the setup instruction.
 */
static struct brw_inst *
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned message_len,
                                  unsigned response_len,
                                  bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      insn = brw_AND(p, addr,
                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                               BRW_GET_SWZ(surface.swizzle, 0)),
                     brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
   brw_inst_set_mlen(devinfo, insn, message_len);
   brw_inst_set_rlen(devinfo, insn, response_len);
   brw_inst_set_header_present(devinfo, insn, header_present);

   return insn;
}
2634
2635 static bool
2636 while_jumps_before_offset(const struct gen_device_info *devinfo,
2637 brw_inst *insn, int while_offset, int start_offset)
2638 {
2639 int scale = 16 / brw_jump_scale(devinfo);
2640 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2641 : brw_inst_jip(devinfo, insn);
2642 assert(jip < 0);
2643 return while_offset + jip * scale <= start_offset;
2644 }
2645
2646
2647 static int
2648 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2649 {
2650 int offset;
2651 void *store = p->store;
2652 const struct gen_device_info *devinfo = p->devinfo;
2653
2654 int depth = 0;
2655
2656 for (offset = next_offset(devinfo, store, start_offset);
2657 offset < p->next_insn_offset;
2658 offset = next_offset(devinfo, store, offset)) {
2659 brw_inst *insn = store + offset;
2660
2661 switch (brw_inst_opcode(devinfo, insn)) {
2662 case BRW_OPCODE_IF:
2663 depth++;
2664 break;
2665 case BRW_OPCODE_ENDIF:
2666 if (depth == 0)
2667 return offset;
2668 depth--;
2669 break;
2670 case BRW_OPCODE_WHILE:
2671 /* If the while doesn't jump before our instruction, it's the end
2672 * of a sibling do...while loop. Ignore it.
2673 */
2674 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2675 continue;
2676 /* fallthrough */
2677 case BRW_OPCODE_ELSE:
2678 case BRW_OPCODE_HALT:
2679 if (depth == 0)
2680 return offset;
2681 }
2682 }
2683
2684 return 0;
2685 }
2686
2687 /* There is no DO instruction on gen6, so to find the end of the loop
2688 * we have to see if the loop is jumping back before our start
2689 * instruction.
2690 */
static int
brw_find_loop_end(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   void *store = p->store;

   assert(devinfo->gen >= 6);

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      /* The first WHILE whose back-edge reaches before start_offset is the
       * end of the loop containing start_offset; WHILEs of sibling loops
       * jump back to points after it.
       */
      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
         if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
            return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}
2716
2717 /* After program generation, go back and update the UIP and JIP of
2718 * BREAK, CONT, and HALT instructions to their correct locations.
2719 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   /* Convert byte offsets to jump units (16 bytes per instruction). */
   int scale = 16 / br;
   void *store = p->store;

   /* Pre-gen6 branches were patched at emit time (brw_patch_break_cont). */
   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      /* Compacted instructions would break the fixed 16-byte stride. */
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset +
                           (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end just falls through to the
          * next instruction (a jump of one unit).
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }
   }
}
2789
2790 void brw_ff_sync(struct brw_codegen *p,
2791 struct brw_reg dest,
2792 unsigned msg_reg_nr,
2793 struct brw_reg src0,
2794 bool allocate,
2795 unsigned response_length,
2796 bool eot)
2797 {
2798 const struct gen_device_info *devinfo = p->devinfo;
2799 brw_inst *insn;
2800
2801 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2802
2803 insn = next_insn(p, BRW_OPCODE_SEND);
2804 brw_set_dest(p, insn, dest);
2805 brw_set_src0(p, insn, src0);
2806 brw_set_src1(p, insn, brw_imm_d(0));
2807
2808 if (devinfo->gen < 6)
2809 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2810
2811 brw_set_ff_sync_message(p,
2812 insn,
2813 allocate,
2814 response_length,
2815 eot);
2816 }
2817
2818 /**
2819 * Emit the SEND instruction necessary to generate stream output data on Gen6
2820 * (for transform feedback).
2821 *
2822 * If send_commit_msg is true, this is the last piece of stream output data
2823 * from this thread, so send the data as a committed write. According to the
2824 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2825 *
2826 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2827 * writes are complete by sending the final write as a committed write."
2828 */
2829 void
2830 brw_svb_write(struct brw_codegen *p,
2831 struct brw_reg dest,
2832 unsigned msg_reg_nr,
2833 struct brw_reg src0,
2834 unsigned binding_table_index,
2835 bool send_commit_msg)
2836 {
2837 const struct gen_device_info *devinfo = p->devinfo;
2838 const unsigned target_cache =
2839 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2840 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2841 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2842 brw_inst *insn;
2843
2844 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2845
2846 insn = next_insn(p, BRW_OPCODE_SEND);
2847 brw_set_dest(p, insn, dest);
2848 brw_set_src0(p, insn, src0);
2849 brw_set_src1(p, insn, brw_imm_d(0));
2850 brw_set_dp_write_message(p, insn,
2851 binding_table_index,
2852 0, /* msg_control: ignored */
2853 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2854 target_cache,
2855 1, /* msg_length */
2856 true, /* header_present */
2857 0, /* last_render_target: ignored */
2858 send_commit_msg, /* response_length */
2859 0, /* end_of_thread */
2860 send_commit_msg); /* send_commit_msg */
2861 }
2862
2863 static unsigned
2864 brw_surface_payload_size(struct brw_codegen *p,
2865 unsigned num_channels,
2866 bool has_simd4x2,
2867 bool has_simd16)
2868 {
2869 if (has_simd4x2 && brw_get_default_access_mode(p) == BRW_ALIGN_16)
2870 return 1;
2871 else if (has_simd16 && brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2872 return 2 * num_channels;
2873 else
2874 return num_channels;
2875 }
2876
2877 static void
2878 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2879 brw_inst *insn,
2880 unsigned atomic_op,
2881 bool response_expected)
2882 {
2883 const struct gen_device_info *devinfo = p->devinfo;
2884 unsigned msg_control =
2885 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2886 (response_expected ? 1 << 5 : 0); /* Return data expected */
2887
2888 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2889 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2890 if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
2891 msg_control |= 1 << 4; /* SIMD8 mode */
2892
2893 brw_inst_set_dp_msg_type(devinfo, insn,
2894 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2895 } else {
2896 brw_inst_set_dp_msg_type(devinfo, insn,
2897 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2898 }
2899 } else {
2900 brw_inst_set_dp_msg_type(devinfo, insn,
2901 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2902
2903 if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
2904 msg_control |= 1 << 4; /* SIMD8 mode */
2905 }
2906
2907 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2908 }
2909
/**
 * Emit an untyped surface atomic operation (one of BRW_AOP_*) through the
 * data cache dataport.  The atomic's return value, if requested, lands in
 * \p dst.
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* HSW+ moved untyped messages to data cache data port 1. */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   /* Note: response_expected doubles as the response channel count (0 or 1)
    * passed to brw_surface_payload_size().
    */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, true),
      header_present);

   brw_set_dp_untyped_atomic_message(
      p, insn, atomic_op, response_expected);
}
2941
2942 static void
2943 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2944 struct brw_inst *insn,
2945 unsigned num_channels)
2946 {
2947 const struct gen_device_info *devinfo = p->devinfo;
2948 /* Set mask of 32-bit channels to drop. */
2949 unsigned msg_control = 0xf & (0xf << num_channels);
2950
2951 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2952 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2953 msg_control |= 1 << 4; /* SIMD16 mode */
2954 else
2955 msg_control |= 2 << 4; /* SIMD8 mode */
2956 }
2957
2958 brw_inst_set_dp_msg_type(devinfo, insn,
2959 (devinfo->gen >= 8 || devinfo->is_haswell ?
2960 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2961 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2962 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2963 }
2964
2965 void
2966 brw_untyped_surface_read(struct brw_codegen *p,
2967 struct brw_reg dst,
2968 struct brw_reg payload,
2969 struct brw_reg surface,
2970 unsigned msg_length,
2971 unsigned num_channels)
2972 {
2973 const struct gen_device_info *devinfo = p->devinfo;
2974 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2975 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2976 GEN7_SFID_DATAPORT_DATA_CACHE);
2977 struct brw_inst *insn = brw_send_indirect_surface_message(
2978 p, sfid, dst, payload, surface, msg_length,
2979 brw_surface_payload_size(p, num_channels, true, true),
2980 false);
2981
2982 brw_set_dp_untyped_surface_read_message(
2983 p, insn, num_channels);
2984 }
2985
2986 static void
2987 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
2988 struct brw_inst *insn,
2989 unsigned num_channels)
2990 {
2991 const struct gen_device_info *devinfo = p->devinfo;
2992 /* Set mask of 32-bit channels to drop. */
2993 unsigned msg_control = 0xf & (0xf << num_channels);
2994
2995 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2996 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2997 msg_control |= 1 << 4; /* SIMD16 mode */
2998 else
2999 msg_control |= 2 << 4; /* SIMD8 mode */
3000 } else {
3001 if (devinfo->gen >= 8 || devinfo->is_haswell)
3002 msg_control |= 0 << 4; /* SIMD4x2 mode */
3003 else
3004 msg_control |= 2 << 4; /* SIMD8 mode */
3005 }
3006
3007 brw_inst_set_dp_msg_type(devinfo, insn,
3008 devinfo->gen >= 8 || devinfo->is_haswell ?
3009 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
3010 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
3011 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3012 }
3013
/**
 * Emit an untyped surface write of \p num_channels 32-bit channels through
 * the data cache dataport.  No response is requested.
 */
void
brw_untyped_surface_write(struct brw_codegen *p,
                          struct brw_reg payload,
                          struct brw_reg surface,
                          unsigned msg_length,
                          unsigned num_channels,
                          bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* HSW+ moved untyped messages to data cache data port 1. */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
                          WRITEMASK_X : WRITEMASK_XYZW;
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(brw_null_reg(), mask),
      payload, surface, msg_length, 0, header_present);

   brw_set_dp_untyped_surface_write_message(
      p, insn, num_channels);
}
3037
3038 static unsigned
3039 brw_byte_scattered_data_element_from_bit_size(unsigned bit_size)
3040 {
3041 switch (bit_size) {
3042 case 8:
3043 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_BYTE;
3044 case 16:
3045 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_WORD;
3046 case 32:
3047 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_DWORD;
3048 default:
3049 unreachable("Unsupported bit_size for byte scattered messages");
3050 }
3051 }
3052
3053
/**
 * Emit a byte-scattered read of \p bit_size-wide elements through data
 * cache data port 0.  Haswell and later only; Align1 mode required.
 */
void
brw_byte_scattered_read(struct brw_codegen *p,
                        struct brw_reg dst,
                        struct brw_reg payload,
                        struct brw_reg surface,
                        unsigned msg_length,
                        unsigned bit_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen > 7 || devinfo->is_haswell);
   assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
   const unsigned sfid = GEN7_SFID_DATAPORT_DATA_CACHE;

   /* The response is a single 32-bit channel per lane. */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, dst, payload, surface, msg_length,
      brw_surface_payload_size(p, 1, true, true),
      false);

   /* Bits 2-3: data element size. */
   unsigned msg_control =
      brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;

   if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
      msg_control |= 1; /* SIMD16 mode */
   else
      msg_control |= 0; /* SIMD8 mode */

   brw_inst_set_dp_msg_type(devinfo, insn,
                            HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ);
   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
3084
/**
 * Emit a byte-scattered write of \p bit_size-wide elements through data
 * cache data port 0.  Haswell and later only; Align1 mode required.
 */
void
brw_byte_scattered_write(struct brw_codegen *p,
                         struct brw_reg payload,
                         struct brw_reg surface,
                         unsigned msg_length,
                         unsigned bit_size,
                         bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen > 7 || devinfo->is_haswell);
   assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
   const unsigned sfid = GEN7_SFID_DATAPORT_DATA_CACHE;

   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(brw_null_reg(), WRITEMASK_XYZW),
      payload, surface, msg_length, 0, header_present);

   /* Bits 2-3: data element size. */
   unsigned msg_control =
      brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;

   if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
      msg_control |= 1; /* SIMD16 mode */
   else
      msg_control |= 0; /* SIMD8 mode */

   brw_inst_set_dp_msg_type(devinfo, insn,
                            HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE);
   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
3114
3115 static void
3116 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3117 struct brw_inst *insn,
3118 unsigned atomic_op,
3119 bool response_expected)
3120 {
3121 const struct gen_device_info *devinfo = p->devinfo;
3122 unsigned msg_control =
3123 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3124 (response_expected ? 1 << 5 : 0); /* Return data expected */
3125
3126 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3127 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3128 if ((brw_get_default_group(p) / 8) % 2 == 1)
3129 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3130
3131 brw_inst_set_dp_msg_type(devinfo, insn,
3132 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3133 } else {
3134 brw_inst_set_dp_msg_type(devinfo, insn,
3135 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3136 }
3137
3138 } else {
3139 brw_inst_set_dp_msg_type(devinfo, insn,
3140 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3141
3142 if ((brw_get_default_group(p) / 8) % 2 == 1)
3143 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3144 }
3145
3146 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3147 }
3148
/**
 * Emit a typed surface atomic operation (one of BRW_AOP_*).  HSW+ uses data
 * cache data port 1; earlier parts route typed messages through the render
 * cache.
 */
void
brw_typed_atomic(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg payload,
                 struct brw_reg surface,
                 unsigned atomic_op,
                 unsigned msg_length,
                 bool response_expected,
                 bool header_present) {
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN6_SFID_DATAPORT_RENDER_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   /* response_expected doubles as the response channel count (0 or 1). */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, false),
      header_present);

   brw_set_dp_typed_atomic_message(
      p, insn, atomic_op, response_expected);
}
3174
/**
 * Fill out the descriptor fields for a typed surface read returning
 * \p num_channels 32-bit channels.
 */
static void
brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
                                      struct brw_inst *insn,
                                      unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Set mask of unused channels. */
   unsigned msg_control = 0xf & (0xf << num_channels);

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
         /* Select the half of the sample mask matching the quarter control
          * of the current instruction group.
          */
         if ((brw_get_default_group(p) / 8) % 2 == 1)
            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
         else
            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
   } else {
      if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
         if ((brw_get_default_group(p) / 8) % 2 == 1)
            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
3206
3207 void
3208 brw_typed_surface_read(struct brw_codegen *p,
3209 struct brw_reg dst,
3210 struct brw_reg payload,
3211 struct brw_reg surface,
3212 unsigned msg_length,
3213 unsigned num_channels,
3214 bool header_present)
3215 {
3216 const struct gen_device_info *devinfo = p->devinfo;
3217 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3218 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3219 GEN6_SFID_DATAPORT_RENDER_CACHE);
3220 struct brw_inst *insn = brw_send_indirect_surface_message(
3221 p, sfid, dst, payload, surface, msg_length,
3222 brw_surface_payload_size(p, num_channels,
3223 devinfo->gen >= 8 || devinfo->is_haswell, false),
3224 header_present);
3225
3226 brw_set_dp_typed_surface_read_message(
3227 p, insn, num_channels);
3228 }
3229
/**
 * Fill out the descriptor fields for a typed surface write of
 * \p num_channels 32-bit channels.
 */
static void
brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
                                       struct brw_inst *insn,
                                       unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Set mask of unused channels. */
   unsigned msg_control = 0xf & (0xf << num_channels);

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
         /* Select the half of the sample mask matching the quarter control
          * of the current instruction group.
          */
         if ((brw_get_default_group(p) / 8) % 2 == 1)
            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
         else
            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);

   } else {
      if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
         if ((brw_get_default_group(p) / 8) % 2 == 1)
            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
3262
3263 void
3264 brw_typed_surface_write(struct brw_codegen *p,
3265 struct brw_reg payload,
3266 struct brw_reg surface,
3267 unsigned msg_length,
3268 unsigned num_channels,
3269 bool header_present)
3270 {
3271 const struct gen_device_info *devinfo = p->devinfo;
3272 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3273 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3274 GEN6_SFID_DATAPORT_RENDER_CACHE);
3275 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3276 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3277 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3278 WRITEMASK_X : WRITEMASK_XYZW);
3279 struct brw_inst *insn = brw_send_indirect_surface_message(
3280 p, sfid, brw_writemask(brw_null_reg(), mask),
3281 payload, surface, msg_length, 0, header_present);
3282
3283 brw_set_dp_typed_surface_write_message(
3284 p, insn, num_channels);
3285 }
3286
3287 static void
3288 brw_set_memory_fence_message(struct brw_codegen *p,
3289 struct brw_inst *insn,
3290 enum brw_message_target sfid,
3291 bool commit_enable)
3292 {
3293 const struct gen_device_info *devinfo = p->devinfo;
3294
3295 brw_set_message_descriptor(p, insn, sfid,
3296 1 /* message length */,
3297 (commit_enable ? 1 : 0) /* response length */,
3298 true /* header present */,
3299 false);
3300
3301 switch (sfid) {
3302 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3303 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3304 break;
3305 case GEN7_SFID_DATAPORT_DATA_CACHE:
3306 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3307 break;
3308 default:
3309 unreachable("Not reached");
3310 }
3311
3312 if (commit_enable)
3313 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3314 }
3315
/**
 * Emit a memory fence through the data cache (and additionally through the
 * render cache on IVB), using \p dst only for dependency tracking.
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 enum opcode send_op)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* A commit (response) is required on Gen10+ and on IVB. */
   const bool commit_enable =
      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
      (devinfo->gen == 7 && !devinfo->is_haswell);
   struct brw_inst *insn;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = vec1(dst);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, send_op);
   dst = retype(dst, BRW_REGISTER_TYPE_UW);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too.  Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, send_op);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   brw_pop_insn_state(p);
}
3363
/**
 * Emit a pixel interpolator (PI) query message.
 *
 * The PI SIMD mode bit only distinguishes SIMD8 from SIMD16 execution, and
 * the slot group field is taken from the quarter control of the current
 * instruction group divided by two, so each slot group covers 16 channels.
 */
void
brw_pixel_interpolator_query(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             bool noperspective,
                             unsigned mode,
                             struct brw_reg data,
                             unsigned msg_length,
                             unsigned response_length)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;
   const uint16_t exec_size = brw_get_default_exec_size(p);
   const uint16_t qtr_ctrl = brw_get_default_group(p) / 8;

   /* brw_send_indirect_message will automatically use a direct send message
    * if data is actually immediate.
    */
   insn = brw_send_indirect_message(p,
                                    GEN7_SFID_PIXEL_INTERPOLATOR,
                                    dest,
                                    mrf,
                                    vec1(data));
   brw_inst_set_mlen(devinfo, insn, msg_length);
   brw_inst_set_rlen(devinfo, insn, response_length);

   /* SIMD mode: 0 is SIMD8, 1 is SIMD16. */
   brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
   /* Pick the 16-channel slot group from the quarter control. */
   brw_inst_set_pi_slot_group(devinfo, insn, qtr_ctrl / 2);
   brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
   brw_inst_set_pi_message_type(devinfo, insn, mode);
}
3395
/**
 * Write the index of the first enabled channel of the current execution mask
 * (optionally combined with the dispatch/vector mask given in \p mask) into
 * the first component of \p dst.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask.  The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n).  Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_reg(p->current->flag_subreg / 2,
                                                  p->current->flag_subreg % 2);

         /* Clear the flag register before accumulating the execution mask
          * into it below.
          */
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0.  We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.  Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3508
/**
 * Copy the value of channel \p idx of \p src into \p dst (i.e.
 * dst = src[idx]), using indirect addressing in Align1 mode and a
 * flag-predicated SEL in SIMD4x2 (Align16) mode.
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * instead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3628
3629 /**
3630 * This instruction is generated as a single-channel align1 instruction by
3631 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3632 *
3633 * We can't use the typed atomic op in the FS because that has the execution
3634 * mask ANDed with the pixel mask, but we just want to write the one dword for
3635 * all the pixels.
3636 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
3638 * one u32. So we use the same untyped atomic write message as the pixel
3639 * shader.
3640 *
3641 * The untyped atomic operation requires a BUFFER surface type with RAW
3642 * format, and is only accessible through the legacy DATA_CACHE dataport
3643 * messages.
3644 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   /* HSW+ put untyped atomics on data cache data port 1. */
   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(p->devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_src1(p, send, brw_imm_ud(0));
   /* 2 message registers, no response, no header. */
   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
   /* Untyped atomic ADD with no return data. */
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);

   brw_pop_insn_state(p);
}
3674
3675
3676 /**
3677 * Emit the SEND message for a barrier
3678 */
3679 void
3680 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3681 {
3682 const struct gen_device_info *devinfo = p->devinfo;
3683 struct brw_inst *inst;
3684
3685 assert(devinfo->gen >= 7);
3686
3687 brw_push_insn_state(p);
3688 brw_set_default_access_mode(p, BRW_ALIGN_1);
3689 inst = next_insn(p, BRW_OPCODE_SEND);
3690 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3691 brw_set_src0(p, inst, src);
3692 brw_set_src1(p, inst, brw_null_reg());
3693
3694 brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
3695 1 /* msg_length */,
3696 0 /* response_length */,
3697 false /* header_present */,
3698 false /* end_of_thread */);
3699
3700 brw_inst_set_gateway_notify(devinfo, inst, 1);
3701 brw_inst_set_gateway_subfuncid(devinfo, inst,
3702 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3703
3704 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3705 brw_pop_insn_state(p);
3706 }
3707
3708
3709 /**
3710 * Emit the wait instruction for a barrier
3711 */
3712 void
3713 brw_WAIT(struct brw_codegen *p)
3714 {
3715 const struct gen_device_info *devinfo = p->devinfo;
3716 struct brw_inst *insn;
3717
3718 struct brw_reg src = brw_notification_reg();
3719
3720 insn = next_insn(p, BRW_OPCODE_WAIT);
3721 brw_set_dest(p, insn, src);
3722 brw_set_src0(p, insn, src);
3723 brw_set_src1(p, insn, brw_null_reg());
3724
3725 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3726 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3727 }
3728
3729 /**
3730 * Changes the floating point rounding mode updating the control register
3731 * field defined at cr0.0[5-6] bits. This function supports the changes to
3732 * RTNE (00), RU (01), RD (10) and RTZ (11) rounding using bitwise operations.
 * Only RTNE and RTZ rounding are enabled in NIR.
3734 */
3735 void
3736 brw_rounding_mode(struct brw_codegen *p,
3737 enum brw_rnd_mode mode)
3738 {
3739 const unsigned bits = mode << BRW_CR0_RND_MODE_SHIFT;
3740
3741 if (bits != BRW_CR0_RND_MODE_MASK) {
3742 brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3743 brw_imm_ud(~BRW_CR0_RND_MODE_MASK));
3744 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3745
3746 /* From the Skylake PRM, Volume 7, page 760:
3747 * "Implementation Restriction on Register Access: When the control
3748 * register is used as an explicit source and/or destination, hardware
3749 * does not ensure execution pipeline coherency. Software must set the
3750 * thread control field to ‘switch’ for an instruction that uses
3751 * control register as an explicit operand."
3752 */
3753 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3754 }
3755
3756 if (bits) {
3757 brw_inst *inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3758 brw_imm_ud(bits));
3759 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3760 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3761 }
3762 }