intel/inst: Indent some code
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 struct brw_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct gen_device_info *devinfo = p->devinfo;
51 if (devinfo->gen < 6)
52 return;
53
54 if (src->file == BRW_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58 brw_push_insn_state(p);
59 brw_set_default_exec_size(p, BRW_EXECUTE_8);
60 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
61 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
62 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
63 retype(*src, BRW_REGISTER_TYPE_UD));
64 brw_pop_insn_state(p);
65 }
66 *src = brw_message_reg(msg_reg_nr);
67 }
68
69 static void
70 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
71 {
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
76 *
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
79 */
80 const struct gen_device_info *devinfo = p->devinfo;
81 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
82 reg->file = BRW_GENERAL_REGISTER_FILE;
83 reg->nr += GEN7_MRF_HACK_START;
84 }
85 }
86
/**
 * Encode \p dest as the destination operand of \p inst.
 *
 * Sets the destination register file, type, address mode, register/subregister
 * number (or indirect offset), horizontal stride, and — in Align16 mode — the
 * writemask.  May also shrink the instruction's execution size to match a
 * small destination (see below).
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the per-gen limits. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* On Gen7+ MRFs are emulated with the top GRFs. */
   gen7_convert_mrf_to_grf(p, &dest);

   {
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            /* A destination cannot have stride 0; promote to stride 1. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            /* Align16 subregisters are in 16-byte units. */
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             *    Although Dst.HorzStride is a don't care for Align16, HW needs
             *    this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct. However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
167
/**
 * Encode \p reg as the first source operand of \p inst.
 *
 * Handles immediates (including 64-bit DF/Q/UQ payloads that spill into the
 * src1 slot), direct and indirect addressing, Align1 region parameters
 * (vstride/width/hstride), and Align16 swizzles.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the per-gen limits. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   /* On Gen7+ MRFs are emulated with the top GRFs. */
   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   {
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* 64-bit immediates (double-float, Q/UQ) use a wider payload. */
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         /* For 32-bit-or-smaller immediates, mirror the type into the src1
          * slot (the immediate itself lives where src1's fields would be).
          */
         if (type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               /* Align16 subregisters are in 16-byte units. */
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A <0,1,0> region is a scalar broadcast; encode it canonically
             * for SIMD1 execution.
             */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
275
276
/**
 * Encode \p reg as the second source operand of \p inst.
 *
 * src1 cannot be an MRF or (on IVB, per the PRM) the accumulator, and only
 * 32-bit immediates fit here.  Region/swizzle encoding mirrors brw_set_src0.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *    operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gen7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            /* Align16 subregisters are in 16-byte units. */
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A <0,1,0> region is a scalar broadcast; encode it canonically
             * for SIMD1 execution.
             */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
367
/**
 * Specify the descriptor and extended descriptor immediate for a SEND(C)
 * message instruction.
 *
 * The descriptor occupies the src1 slot, so src1 is marked as an immediate
 * UD here.  The extended descriptor field only exists on Gen9+.
 */
void
brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
                unsigned desc, unsigned ex_desc)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
   brw_inst_set_src1_file_type(devinfo, inst,
                               BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
   brw_inst_set_send_desc(devinfo, inst, desc);
   if (devinfo->gen >= 9)
      brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
}
385
386 static void brw_set_math_message( struct brw_codegen *p,
387 brw_inst *inst,
388 unsigned function,
389 unsigned integer_type,
390 bool low_precision,
391 unsigned dataType )
392 {
393 const struct gen_device_info *devinfo = p->devinfo;
394 unsigned msg_length;
395 unsigned response_length;
396
397 /* Infer message length from the function */
398 switch (function) {
399 case BRW_MATH_FUNCTION_POW:
400 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
401 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
402 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
403 msg_length = 2;
404 break;
405 default:
406 msg_length = 1;
407 break;
408 }
409
410 /* Infer response length from the function */
411 switch (function) {
412 case BRW_MATH_FUNCTION_SINCOS:
413 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
414 response_length = 2;
415 break;
416 default:
417 response_length = 1;
418 break;
419 }
420
421 brw_set_desc(p, inst, brw_message_desc(
422 devinfo, msg_length, response_length, false));
423
424 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
425 brw_inst_set_math_msg_function(devinfo, inst, function);
426 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
427 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
428 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
429 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
430 brw_inst_set_saturate(devinfo, inst, 0);
431 }
432
433
/* Fill out the message descriptor for an URB FF_SYNC message (one payload
 * register, optional URB handle allocation, optional end-of-thread).
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
455
/* Fill out the message descriptor for an URB write message.  Several fields
 * are generation-dependent: allocate/used exist only before Gen7, per-slot
 * offset only on Gen7+, and complete only before Gen8.
 */
static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Reject flag/swizzle combinations the target generation cannot encode. */
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
498
/* Fill out the message descriptor for a Gen7+ data-port scratch block
 * read/write message.  \p num_regs is the block size in registers; the
 * encoding differs between Gen7 (num_regs - 1) and Gen8+ (log2).
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* 8-register blocks are only encodable on Gen8+. */
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
528
/* Copy the default instruction state (exec size, masking, predication,
 * flag register, etc.) into \p insn.  3-source Align16 instructions use
 * different flag-register encodings than everything else.
 */
static void
brw_inst_set_state(const struct gen_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* flag_subreg packs reg and subreg: reg = subreg / 2, subreg = subreg % 2 */
   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
557
#define next_insn brw_next_insn
/**
 * Allocate the next instruction slot in the codegen store, zero it, set its
 * opcode, and apply the current default instruction state.  Grows the store
 * (doubling) when full.
 */
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   /* Each native instruction is 16 bytes. */
   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];

   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(devinfo, insn, opcode);

   /* Apply the default instruction state */
   brw_inst_set_state(devinfo, insn, p->current);

   return insn;
}
581
582 static brw_inst *
583 brw_alu1(struct brw_codegen *p, unsigned opcode,
584 struct brw_reg dest, struct brw_reg src)
585 {
586 brw_inst *insn = next_insn(p, opcode);
587 brw_set_dest(p, insn, dest);
588 brw_set_src0(p, insn, src);
589 return insn;
590 }
591
592 static brw_inst *
593 brw_alu2(struct brw_codegen *p, unsigned opcode,
594 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
595 {
596 /* 64-bit immediates are only supported on 1-src instructions */
597 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
598 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
599
600 brw_inst *insn = next_insn(p, opcode);
601 brw_set_dest(p, insn, dest);
602 brw_set_src0(p, insn, src0);
603 brw_set_src1(p, insn, src1);
604 return insn;
605 }
606
607 static int
608 get_3src_subreg_nr(struct brw_reg reg)
609 {
610 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
611 * use 32-bit units (components 0..7). Since they only support F/D/UD
612 * types, this doesn't lose any flexibility, but uses fewer bits.
613 */
614 return reg.subnr / 4;
615 }
616
617 static enum gen10_align1_3src_vertical_stride
618 to_3src_align1_vstride(enum brw_vertical_stride vstride)
619 {
620 switch (vstride) {
621 case BRW_VERTICAL_STRIDE_0:
622 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
623 case BRW_VERTICAL_STRIDE_2:
624 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
625 case BRW_VERTICAL_STRIDE_4:
626 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
627 case BRW_VERTICAL_STRIDE_8:
628 case BRW_VERTICAL_STRIDE_16:
629 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
630 default:
631 unreachable("invalid vstride");
632 }
633 }
634
635
636 static enum gen10_align1_3src_src_horizontal_stride
637 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
638 {
639 switch (hstride) {
640 case BRW_HORIZONTAL_STRIDE_0:
641 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
642 case BRW_HORIZONTAL_STRIDE_1:
643 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
644 case BRW_HORIZONTAL_STRIDE_2:
645 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
646 case BRW_HORIZONTAL_STRIDE_4:
647 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
648 default:
649 unreachable("invalid hstride");
650 }
651 }
652
/* Emit a three-source ALU instruction (MAD, LRP, BFE, ...).  Gen10+ Align1
 * and pre-Gen10 Align16 use entirely different encodings, handled by the
 * two arms of the big if below.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   /* 3-src instructions only support direct addressing. */
   assert(dest.nr < 128);
   assert(src0.file != BRW_IMMEDIATE_VALUE || src0.nr < 128);
   assert(src1.file != BRW_IMMEDIATE_VALUE || src1.nr < 128);
   assert(src2.file != BRW_IMMEDIATE_VALUE || src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      /* Align1 3-src encoding (Gen10+ style fields). */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                           BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                           BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      }
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      brw_inst_set_3src_a1_src0_vstride(devinfo, inst,
                                        to_3src_align1_vstride(src0.vstride));
      brw_inst_set_3src_a1_src1_vstride(devinfo, inst,
                                        to_3src_align1_vstride(src1.vstride));
      /* no vstride on src2 */

      brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src0.hstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));
      brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src2.hstride));

      /* NF-typed src0 reads the accumulator regardless of reg number. */
      brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
      if (src0.type == BRW_REGISTER_TYPE_NF) {
         brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      }
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);

      /* An ARF src1 can only be the accumulator. */
      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                         src0.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                         src1.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_ACCUMULATOR);
      brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                         src2.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
   } else {
      /* Align16 3-src encoding (pre-Gen10 style fields). */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D ||
             dest.type == BRW_REGISTER_TYPE_UD);
      if (devinfo->gen == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->gen >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types. The MAD and LRP emitters ensure
          * that all four types are float. The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
      }
   }

   return inst;
}
814
815
816 /***********************************************************************
817 * Convenience routines.
818 */
819 #define ALU1(OP) \
820 brw_inst *brw_##OP(struct brw_codegen *p, \
821 struct brw_reg dest, \
822 struct brw_reg src0) \
823 { \
824 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
825 }
826
827 #define ALU2(OP) \
828 brw_inst *brw_##OP(struct brw_codegen *p, \
829 struct brw_reg dest, \
830 struct brw_reg src0, \
831 struct brw_reg src1) \
832 { \
833 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
834 }
835
836 #define ALU3(OP) \
837 brw_inst *brw_##OP(struct brw_codegen *p, \
838 struct brw_reg dest, \
839 struct brw_reg src0, \
840 struct brw_reg src1, \
841 struct brw_reg src2) \
842 { \
843 if (p->current->access_mode == BRW_ALIGN_16) { \
844 if (src0.vstride == BRW_VERTICAL_STRIDE_0) \
845 src0.swizzle = BRW_SWIZZLE_XXXX; \
846 if (src1.vstride == BRW_VERTICAL_STRIDE_0) \
847 src1.swizzle = BRW_SWIZZLE_XXXX; \
848 if (src2.vstride == BRW_VERTICAL_STRIDE_0) \
849 src2.swizzle = BRW_SWIZZLE_XXXX; \
850 } \
851 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
852 }
853
/* Like ALU3, but for floating-point-only three-source opcodes: asserts the
 * destination is F or DF and that all sources match the destination type,
 * then applies the same Align16 scalar-swizzle fixup as ALU3.
 */
#define ALU3F(OP)                                                 \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                     \
          dest.type == BRW_REGISTER_TYPE_DF);                     \
   if (dest.type == BRW_REGISTER_TYPE_F) {                        \
      assert(src0.type == BRW_REGISTER_TYPE_F);                   \
      assert(src1.type == BRW_REGISTER_TYPE_F);                   \
      assert(src2.type == BRW_REGISTER_TYPE_F);                   \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                  \
   }                                                              \
                                                                  \
   if (p->current->access_mode == BRW_ALIGN_16) {                 \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src0.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src1.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src2.swizzle = BRW_SWIZZLE_XXXX;                         \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}
883
884 /* Rounding operations (other than RNDD) require two instructions - the first
885 * stores a rounded value (possibly the wrong way) in the dest register, but
886 * also sets a per-channel "increment bit" in the flag register. A predicated
887 * add of 1.0 fixes dest to contain the desired result.
888 *
889 * Sandybridge and later appear to round correctly without an ADD.
890 */
/* Define brw_<OP>() for a rounding opcode.  Note `add` is only assigned on
 * gen < 6, where the predicated fix-up ADD described above is required.
 */
#define ROUND(OP)                                                         \
void brw_##OP(struct brw_codegen *p,                                      \
              struct brw_reg dest,                                        \
              struct brw_reg src)                                         \
{                                                                         \
   const struct gen_device_info *devinfo = p->devinfo;                    \
   brw_inst *rnd, *add;                                                   \
   rnd = next_insn(p, BRW_OPCODE_##OP);                                   \
   brw_set_dest(p, rnd, dest);                                            \
   brw_set_src0(p, rnd, src);                                             \
                                                                          \
   if (devinfo->gen < 6) {                                                \
      /* turn on round-increments */                                      \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);        \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                      \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);      \
   }                                                                      \
}
909
910
/* Instantiate the emit helpers for all of the simple one-, two- and
 * three-source opcodes, using the ALU and ROUND macros defined above.
 */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
944
945 brw_inst *
946 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
947 {
948 const struct gen_device_info *devinfo = p->devinfo;
949
950 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
951 * To avoid the problems that causes, we use an <X,2,0> source region to
952 * read each element twice.
953 */
954 if (devinfo->gen == 7 && !devinfo->is_haswell &&
955 brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
956 dest.type == BRW_REGISTER_TYPE_DF &&
957 (src0.type == BRW_REGISTER_TYPE_F ||
958 src0.type == BRW_REGISTER_TYPE_D ||
959 src0.type == BRW_REGISTER_TYPE_UD) &&
960 !has_scalar_region(src0)) {
961 assert(src0.vstride == src0.width + src0.hstride);
962 src0.vstride = src0.hstride;
963 src0.width = BRW_WIDTH_2;
964 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
965 }
966
967 return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
968 }
969
970 brw_inst *
971 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
972 struct brw_reg src0, struct brw_reg src1)
973 {
974 /* 6.2.2: add */
975 if (src0.type == BRW_REGISTER_TYPE_F ||
976 (src0.file == BRW_IMMEDIATE_VALUE &&
977 src0.type == BRW_REGISTER_TYPE_VF)) {
978 assert(src1.type != BRW_REGISTER_TYPE_UD);
979 assert(src1.type != BRW_REGISTER_TYPE_D);
980 }
981
982 if (src1.type == BRW_REGISTER_TYPE_F ||
983 (src1.file == BRW_IMMEDIATE_VALUE &&
984 src1.type == BRW_REGISTER_TYPE_VF)) {
985 assert(src0.type != BRW_REGISTER_TYPE_UD);
986 assert(src0.type != BRW_REGISTER_TYPE_D);
987 }
988
989 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
990 }
991
992 brw_inst *
993 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
994 struct brw_reg src0, struct brw_reg src1)
995 {
996 assert(dest.type == src0.type);
997 assert(src0.type == src1.type);
998 switch (src0.type) {
999 case BRW_REGISTER_TYPE_B:
1000 case BRW_REGISTER_TYPE_UB:
1001 case BRW_REGISTER_TYPE_W:
1002 case BRW_REGISTER_TYPE_UW:
1003 case BRW_REGISTER_TYPE_D:
1004 case BRW_REGISTER_TYPE_UD:
1005 break;
1006 default:
1007 unreachable("Bad type for brw_AVG");
1008 }
1009
1010 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1011 }
1012
1013 brw_inst *
1014 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1015 struct brw_reg src0, struct brw_reg src1)
1016 {
1017 /* 6.32.38: mul */
1018 if (src0.type == BRW_REGISTER_TYPE_D ||
1019 src0.type == BRW_REGISTER_TYPE_UD ||
1020 src1.type == BRW_REGISTER_TYPE_D ||
1021 src1.type == BRW_REGISTER_TYPE_UD) {
1022 assert(dest.type != BRW_REGISTER_TYPE_F);
1023 }
1024
1025 if (src0.type == BRW_REGISTER_TYPE_F ||
1026 (src0.file == BRW_IMMEDIATE_VALUE &&
1027 src0.type == BRW_REGISTER_TYPE_VF)) {
1028 assert(src1.type != BRW_REGISTER_TYPE_UD);
1029 assert(src1.type != BRW_REGISTER_TYPE_D);
1030 }
1031
1032 if (src1.type == BRW_REGISTER_TYPE_F ||
1033 (src1.file == BRW_IMMEDIATE_VALUE &&
1034 src1.type == BRW_REGISTER_TYPE_VF)) {
1035 assert(src0.type != BRW_REGISTER_TYPE_UD);
1036 assert(src0.type != BRW_REGISTER_TYPE_D);
1037 }
1038
1039 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1040 src0.nr != BRW_ARF_ACCUMULATOR);
1041 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1042 src1.nr != BRW_ARF_ACCUMULATOR);
1043
1044 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1045 }
1046
1047 brw_inst *
1048 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1049 struct brw_reg src0, struct brw_reg src1)
1050 {
1051 src0.vstride = BRW_VERTICAL_STRIDE_0;
1052 src0.width = BRW_WIDTH_1;
1053 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1054 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1055 }
1056
1057 brw_inst *
1058 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1059 struct brw_reg src0, struct brw_reg src1)
1060 {
1061 src0.vstride = BRW_VERTICAL_STRIDE_0;
1062 src0.width = BRW_WIDTH_1;
1063 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1064 src1.vstride = BRW_VERTICAL_STRIDE_8;
1065 src1.width = BRW_WIDTH_8;
1066 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1067 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1068 }
1069
/* Emit a float-to-half conversion of `src` into `dst`.
 *
 * On Gen8+ this is a converting MOV to an HF-typed destination; on Gen7 it
 * is the dedicated F32TO16 instruction.  When `dst` is UD and the hardware
 * does not zero the high 16 bits of each dword itself, a second MOV is
 * emitted to clear them explicitly.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Retype the destination to W with stride 2, so the conversion
       * writes the low word of each dword and the second MOV below can
       * clear the high word.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* NoDDClr on the first write paired with NoDDChk on the second lets
       * both partial writes of the same register proceed back-to-back.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1116
1117 brw_inst *
1118 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1119 {
1120 const struct gen_device_info *devinfo = p->devinfo;
1121 bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1122
1123 if (align16) {
1124 assert(src.type == BRW_REGISTER_TYPE_UD);
1125 } else {
1126 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1127 *
1128 * Because this instruction does not have a 16-bit floating-point
1129 * type, the source data type must be Word (W). The destination type
1130 * must be F (Float).
1131 */
1132 if (src.type == BRW_REGISTER_TYPE_UD)
1133 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1134
1135 assert(src.type == BRW_REGISTER_TYPE_W ||
1136 src.type == BRW_REGISTER_TYPE_UW ||
1137 src.type == BRW_REGISTER_TYPE_HF);
1138 }
1139
1140 if (devinfo->gen >= 8) {
1141 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1142 } else {
1143 assert(devinfo->gen == 7);
1144 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1145 }
1146 }
1147
1148
1149 void brw_NOP(struct brw_codegen *p)
1150 {
1151 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1152 memset(insn, 0, sizeof(*insn));
1153 brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1154 }
1155
1156
1157
1158
1159
1160 /***********************************************************************
1161 * Comparisons, if/else/endif
1162 */
1163
1164 brw_inst *
1165 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1166 unsigned predicate_control)
1167 {
1168 const struct gen_device_info *devinfo = p->devinfo;
1169 struct brw_reg ip = brw_ip_reg();
1170 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1171
1172 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1173 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1174 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1175 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1176
1177 return inst;
1178 }
1179
1180 static void
1181 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1182 {
1183 p->if_stack[p->if_stack_depth] = inst - p->store;
1184
1185 p->if_stack_depth++;
1186 if (p->if_stack_array_size <= p->if_stack_depth) {
1187 p->if_stack_array_size *= 2;
1188 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1189 p->if_stack_array_size);
1190 }
1191 }
1192
1193 static brw_inst *
1194 pop_if_stack(struct brw_codegen *p)
1195 {
1196 p->if_stack_depth--;
1197 return &p->store[p->if_stack[p->if_stack_depth]];
1198 }
1199
/* Push a DO instruction (as an index into p->store) onto the loop stack and
 * reset the IF-nesting counter for the new loop level.
 */
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   /* if_depth_in_loop is indexed at the *post-increment* depth below,
    * hence the "+ 1" in this size check.
    */
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   /* Store an index, not a pointer: p->store may be reallocated later. */
   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}
1215
1216 static brw_inst *
1217 get_inner_do_insn(struct brw_codegen *p)
1218 {
1219 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1220 }
1221
1222 /* EU takes the value from the flag register and pushes it onto some
1223 * sort of a stack (presumably merging with any flag value already on
1224 * the stack). Within an if block, the flags at the top of the stack
1225 * control execution on each channel of the unit, eg. on each of the
1226 * 16 pixel values in our wm programs.
1227 *
1228 * When the matching 'else' instruction is reached (presumably by
1229 * countdown of the instruction count patched in by our ELSE/ENDIF
1230 * functions), the relevant flags are inverted.
1231 *
1232 * When the matching 'endif' instruction is reached, the flags are
1233 * popped off. If the stack is now empty, normal execution resumes.
1234 */
/* Emit an IF instruction with the given execution size and push it onto the
 * if-stack so that the matching ELSE/ENDIF can patch its jump targets later.
 * The operand layout differs per generation (see below).
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      /* Pre-gen6: IF operates on IP; jump count patched later. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: the jump count lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: JIP/UIP fields are patched in by patch_IF_ELSE(). */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires an explicit thread switch unless we are
    * in single-program-flow mode.
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1278
1279 /* This function is only used for gen6-style IF instructions with an
1280 * embedded comparison (conditional modifier). It is not used on gen7.
1281 */
1282 brw_inst *
1283 gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
1284 struct brw_reg src0, struct brw_reg src1)
1285 {
1286 const struct gen_device_info *devinfo = p->devinfo;
1287 brw_inst *insn;
1288
1289 insn = next_insn(p, BRW_OPCODE_IF);
1290
1291 brw_set_dest(p, insn, brw_imm_w(0));
1292 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1293 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1294 brw_set_src0(p, insn, src0);
1295 brw_set_src1(p, insn, src1);
1296
1297 assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
1298 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
1299 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1300
1301 push_if_stack(p, insn);
1302 return insn;
1303 }
1304
1305 /**
1306 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1307 */
/* Rewrite an IF (and optional ELSE) into predicated ADDs on IP, for
 * single-program-flow mode.  `else_inst` may be NULL.  Jump distances are in
 * bytes (16 bytes per instruction).
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* Distances in bytes: 16 bytes per instruction. */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1345
1346 /**
1347 * Patch IF and ELSE instructions with appropriate jump targets.
1348 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* Jump distances are in units of brw_jump_scale() per instruction. */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1436
/* Emit an ELSE instruction and push it onto the if-stack; brw_ENDIF()
 * patches its jump targets later via patch_IF_ELSE().
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* Per-generation operand layout, mirroring brw_IF(). */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires an explicit thread switch unless in
    * single-program-flow mode.
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1474
/* Close the innermost IF/ELSE block: pop the IF (and optional ELSE) off the
 * if-stack, emit an ENDIF where applicable, and patch all jump targets.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand layout for the ENDIF itself. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1555
/* Emit a BREAK instruction.  Jump targets are filled in later (pre-gen6 by
 * brw_patch_break_cont(), gen6+ elsewhere); pre-gen6 also records how many
 * mask-stack entries the break must pop.
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      /* Pop one mask-stack entry per IF level we are breaking out of. */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
1582
1583 brw_inst *
1584 brw_CONT(struct brw_codegen *p)
1585 {
1586 const struct gen_device_info *devinfo = p->devinfo;
1587 brw_inst *insn;
1588
1589 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1590 brw_set_dest(p, insn, brw_ip_reg());
1591 if (devinfo->gen >= 8) {
1592 brw_set_src0(p, insn, brw_imm_d(0x0));
1593 } else {
1594 brw_set_src0(p, insn, brw_ip_reg());
1595 brw_set_src1(p, insn, brw_imm_d(0x0));
1596 }
1597
1598 if (devinfo->gen < 6) {
1599 brw_inst_set_gen4_pop_count(devinfo, insn,
1600 p->if_depth_in_loop[p->loop_stack_depth]);
1601 }
1602 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1603 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1604 return insn;
1605 }
1606
1607 brw_inst *
1608 gen6_HALT(struct brw_codegen *p)
1609 {
1610 const struct gen_device_info *devinfo = p->devinfo;
1611 brw_inst *insn;
1612
1613 insn = next_insn(p, BRW_OPCODE_HALT);
1614 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1615 if (devinfo->gen >= 8) {
1616 brw_set_src0(p, insn, brw_imm_d(0x0));
1617 } else {
1618 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1619 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1620 }
1621
1622 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1623 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1624 return insn;
1625 }
1626
1627 /* DO/WHILE loop:
1628 *
1629 * The DO/WHILE is just an unterminated loop -- break or continue are
1630 * used for control within the loop. We have a few ways they can be
1631 * done.
1632 *
1633 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1634 * jip and no DO instruction.
1635 *
1636 * For non-uniform control flow pre-gen6, there's a DO instruction to
1637 * push the mask, and a WHILE to jump back, and BREAK to get out and
1638 * pop the mask.
1639 *
1640 * For gen6, there's no more mask stack, so no need for DO. WHILE
1641 * just points back to the first instruction of the loop.
1642 */
/* Open a loop.  On gen6+ (or in single-program-flow mode) no DO instruction
 * is emitted -- the loop stack just records where the loop body starts.  On
 * earlier gens an actual DO instruction is emitted to push the mask stack.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      /* No instruction needed: remember the address of the next instruction
       * so the matching WHILE can jump back to it.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1669
1670 /**
1671 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1672 * instruction here.
1673 *
1674 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1675 * nesting, since it can always just point to the end of the block/current loop.
1676 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   /* Jump counts are in units of brw_jump_scale() per instruction. */
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   /* Walk backwards from the WHILE to the matching DO. */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps just past the WHILE, out of the loop. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONTINUE jumps to the WHILE itself, to re-test the condition. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
1701
/* Close the innermost loop with a WHILE instruction (or, pre-gen6 in
 * single-program-flow mode, an ADD on IP) jumping back to the matching DO,
 * and pop the loop stack.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* Per-generation encoding of the backwards jump to the DO. */
      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* Express the loop jump as an ADD on IP (16 bytes/instruction). */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Fill in jump targets of any BREAK/CONT inside this loop. */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1763
1764 /* FORWARD JUMPS:
1765 */
1766 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1767 {
1768 const struct gen_device_info *devinfo = p->devinfo;
1769 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1770 unsigned jmpi = 1;
1771
1772 if (devinfo->gen >= 5)
1773 jmpi = 2;
1774
1775 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1776 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1777
1778 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1779 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1780 }
1781
1782 /* To integrate with the above, it makes sense that the comparison
1783 * instruction should populate the flag register. It might be simpler
1784 * just to use the flag reg for most WM tasks?
1785 */
1786 void brw_CMP(struct brw_codegen *p,
1787 struct brw_reg dest,
1788 unsigned conditional,
1789 struct brw_reg src0,
1790 struct brw_reg src1)
1791 {
1792 const struct gen_device_info *devinfo = p->devinfo;
1793 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1794
1795 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1796 brw_set_dest(p, insn, dest);
1797 brw_set_src0(p, insn, src0);
1798 brw_set_src1(p, insn, src1);
1799
1800 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1801 * page says:
1802 * "Any CMP instruction with a null destination must use a {switch}."
1803 *
1804 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1805 * mentioned on their work-arounds pages.
1806 */
1807 if (devinfo->gen == 7) {
1808 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1809 dest.nr == BRW_ARF_NULL) {
1810 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1811 }
1812 }
1813 }
1814
1815 /***********************************************************************
1816 * Helpers for the various SEND message types:
1817 */
1818
1819 /** Extended math function, float[8].
1820 */
1821 void gen4_math(struct brw_codegen *p,
1822 struct brw_reg dest,
1823 unsigned function,
1824 unsigned msg_reg_nr,
1825 struct brw_reg src,
1826 unsigned precision )
1827 {
1828 const struct gen_device_info *devinfo = p->devinfo;
1829 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1830 unsigned data_type;
1831 if (has_scalar_region(src)) {
1832 data_type = BRW_MATH_DATA_SCALAR;
1833 } else {
1834 data_type = BRW_MATH_DATA_VECTOR;
1835 }
1836
1837 assert(devinfo->gen < 6);
1838
1839 /* Example code doesn't set predicate_control for send
1840 * instructions.
1841 */
1842 brw_inst_set_pred_control(devinfo, insn, 0);
1843 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1844
1845 brw_set_dest(p, insn, dest);
1846 brw_set_src0(p, insn, src);
1847 brw_set_math_message(p,
1848 insn,
1849 function,
1850 src.type == BRW_REGISTER_TYPE_D,
1851 precision,
1852 data_type);
1853 }
1854
1855 void gen6_math(struct brw_codegen *p,
1856 struct brw_reg dest,
1857 unsigned function,
1858 struct brw_reg src0,
1859 struct brw_reg src1)
1860 {
1861 const struct gen_device_info *devinfo = p->devinfo;
1862 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1863
1864 assert(devinfo->gen >= 6);
1865
1866 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1867 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1868
1869 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1870 if (devinfo->gen == 6) {
1871 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1872 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1873 }
1874
1875 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1876 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1877 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1878 assert(src0.type != BRW_REGISTER_TYPE_F);
1879 assert(src1.type != BRW_REGISTER_TYPE_F);
1880 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
1881 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
1882 } else {
1883 assert(src0.type == BRW_REGISTER_TYPE_F);
1884 assert(src1.type == BRW_REGISTER_TYPE_F);
1885 }
1886
1887 /* Source modifiers are ignored for extended math instructions on Gen6. */
1888 if (devinfo->gen == 6) {
1889 assert(!src0.negate);
1890 assert(!src0.abs);
1891 assert(!src1.negate);
1892 assert(!src1.abs);
1893 }
1894
1895 brw_inst_set_math_function(devinfo, insn, function);
1896
1897 brw_set_dest(p, insn, dest);
1898 brw_set_src0(p, insn, src0);
1899 brw_set_src1(p, insn, src1);
1900 }
1901
1902 /**
1903 * Return the right surface index to access the thread scratch space using
1904 * stateless dataport messages.
1905 */
1906 unsigned
1907 brw_scratch_surface_idx(const struct brw_codegen *p)
1908 {
1909 /* The scratch space is thread-local so IA coherency is unnecessary. */
1910 if (p->devinfo->gen >= 8)
1911 return GEN8_BTI_STATELESS_NON_COHERENT;
1912 else
1913 return BRW_BTI_STATELESS;
1914 }
1915
1916 /**
1917 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1918 * using a constant offset per channel.
1919 *
1920 * The offset must be aligned to oword size (16 bytes). Used for
1921 * register spilling.
1922 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Scratch writes target a different shared function per generation:
    * data cache on Gen7+, render cache on Gen6, the legacy write data
    * port earlier.
    */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   uint32_t msg_type;

   /* On Gen6+ the message takes the offset in owords rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Message length: one header register plus the data registers. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
         src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                     msg_type, 0, /* not a render target */
                                     send_commit_msg));
   }
}
2021
2022
2023 /**
2024 * Read a block of owords (half a GRF each) from the scratch buffer
2025 * using a constant index per channel.
2026 *
2027 * Offset must be aligned to oword size (16 bytes). Used for register
2028 * spilling.
2029 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* On Gen6+ the message takes the offset in owords rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* One response register per register read. */
   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_READ);

   /* Build the message header: a copy of g0 with the global offset
    * written into element 2.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest); /* UW? */
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         /* Pre-Gen6 the payload is taken implicitly from the base MRF. */
         brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, 1, rlen, true) |
                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   }
}
2100
/**
 * Read num_regs registers from the thread scratch space starting at the
 * given byte offset, using the Gen7+ MRF-less scratch block read message.
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
2134
2135 /**
2136 * Read float[4] vectors from the data port constant cache.
2137 * Location (in buffer) should be a multiple of 16.
2138 * Used for fetching shader constants.
2139 */
void brw_oword_block_read(struct brw_codegen *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Constant fetches go through the constant cache on Gen6+. */
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_SFID_DATAPORT_READ);
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Force unpredicated, uncompressed, WE_all execution for both the
    * header setup and the SEND itself.
    */
   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_inst_set_sfid(devinfo, insn, target_cache);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      /* Pre-Gen6 the payload is taken implicitly from the base MRF. */
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
                brw_dp_read_desc(devinfo, bind_table_index,
                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));

   brw_pop_insn_state(p);
}
2200
/**
 * Emit a framebuffer (render target) write message and return the emitted
 * instruction so the caller can patch it further.  Uses SENDC on Gen6+ and
 * a plain SEND earlier.
 */
brw_inst *
brw_fb_WRITE(struct brw_codegen *p,
             struct brw_reg payload,
             struct brw_reg implied_header,
             unsigned msg_control,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool eot,
             bool last_render_target,
             bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   /* The write returns nothing; use a null destination sized to match the
    * default execution size.
    */
   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      /* Pre-Gen6 the payload lives in MRFs starting at payload.nr and is
       * sent implicitly; src0 carries the implied header.
       */
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_dp_write_desc(devinfo, binding_table_index, msg_control,
                                  msg_type, last_render_target,
                                  0 /* send_commit_msg */));
   brw_inst_set_eot(devinfo, insn, eot);

   return insn;
}
2259
/**
 * Emit a Gen9+ render-target read message (SENDC to the render cache) and
 * return the emitted instruction.
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   /* Message subtype: 0 when the default exec size is SIMD16, 1 otherwise. */
   const unsigned msg_subtype =
      brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_desc(
      p, insn,
      brw_message_desc(devinfo, msg_length, response_length, true) |
      brw_dp_read_desc(devinfo, binding_table_index,
                       per_sample << 5 | msg_subtype,
                       GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                       BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   /* The RT slot group is derived from the default 16-channel group index. */
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}
2289
2290 /**
2291 * Texture sample instruction.
2292 * Note: the msg_type plus msg_length values determine exactly what kind
2293 * of sampling operation is performed. See volume 4, page 161 of docs.
2294 */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr == -1 means the caller has already placed the payload. */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_sampler_desc(devinfo, binding_table_index, sampler,
                                 msg_type, simd_mode, return_format));
}
2343
2344 /* Adjust the message header's sampler state pointer to
2345 * select the correct group of 16 samplers.
2346 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         /* Bump the pointer in g0.3 of the header by the byte offset of
          * the sampler's group of 16.
          */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* Compute (index & 0xf0) << 4, which equals
       * 16 * (index / 16) * sampler_state_size — the dynamic counterpart
       * of the immediate case above.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2389
2390 /* All these variables are pretty confusing - we might be better off
2391 * using bitmasks and macros for this, in the old style. Or perhaps
2392 * just having the caller instantiate the fields in dword3 itself.
2393 */
void brw_urb_WRITE(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header by
       * OR-ing 0xff00 into dword 5 of the header (MRF msg_reg_nr).
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-Gen6 SENDs take the payload from the base MRF implicitly. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}
2441
/**
 * Emit a SEND to the shared function \p sfid.  If \p desc is an immediate
 * it is combined with \p desc_imm and encoded directly; otherwise the
 * descriptor is OR'ed with \p desc_imm into address register a0.0 and the
 * SEND takes its descriptor from there (via src1).
 */
void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_desc(p, send, desc.ud | desc_imm);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* The a0.0 write must be scalar, unmasked, and unpredicated. */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);
}
2486
/**
 * Wrapper around brw_send_indirect_message() that handles a possibly
 * non-immediate surface index.  A register surface index is masked to
 * 8 bits and loaded into a0.0, where brw_send_indirect_message() OR's it
 * with \p desc_imm to form the full descriptor.
 */
static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* The a0.0 write must be scalar, unmasked, and unpredicated. */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm);
}
2519
2520 static bool
2521 while_jumps_before_offset(const struct gen_device_info *devinfo,
2522 brw_inst *insn, int while_offset, int start_offset)
2523 {
2524 int scale = 16 / brw_jump_scale(devinfo);
2525 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2526 : brw_inst_jip(devinfo, insn);
2527 assert(jip < 0);
2528 return while_offset + jip * scale <= start_offset;
2529 }
2530
2531
2532 static int
2533 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2534 {
2535 int offset;
2536 void *store = p->store;
2537 const struct gen_device_info *devinfo = p->devinfo;
2538
2539 int depth = 0;
2540
2541 for (offset = next_offset(devinfo, store, start_offset);
2542 offset < p->next_insn_offset;
2543 offset = next_offset(devinfo, store, offset)) {
2544 brw_inst *insn = store + offset;
2545
2546 switch (brw_inst_opcode(devinfo, insn)) {
2547 case BRW_OPCODE_IF:
2548 depth++;
2549 break;
2550 case BRW_OPCODE_ENDIF:
2551 if (depth == 0)
2552 return offset;
2553 depth--;
2554 break;
2555 case BRW_OPCODE_WHILE:
2556 /* If the while doesn't jump before our instruction, it's the end
2557 * of a sibling do...while loop. Ignore it.
2558 */
2559 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2560 continue;
2561 /* fallthrough */
2562 case BRW_OPCODE_ELSE:
2563 case BRW_OPCODE_HALT:
2564 if (depth == 0)
2565 return offset;
2566 }
2567 }
2568
2569 return 0;
2570 }
2571
2572 /* There is no DO instruction on gen6, so to find the end of the loop
2573 * we have to see if the loop is jumping back before our start
2574 * instruction.
2575 */
2576 static int
2577 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2578 {
2579 const struct gen_device_info *devinfo = p->devinfo;
2580 int offset;
2581 void *store = p->store;
2582
2583 assert(devinfo->gen >= 6);
2584
2585 /* Always start after the instruction (such as a WHILE) we're trying to fix
2586 * up.
2587 */
2588 for (offset = next_offset(devinfo, store, start_offset);
2589 offset < p->next_insn_offset;
2590 offset = next_offset(devinfo, store, offset)) {
2591 brw_inst *insn = store + offset;
2592
2593 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2594 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2595 return offset;
2596 }
2597 }
2598 assert(!"not reached");
2599 return start_offset;
2600 }
2601
2602 /* After program generation, go back and update the UIP and JIP of
2603 * BREAK, CONT, and HALT instructions to their correct locations.
2604 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* JIP/UIP patching only applies to Gen6 and later. */
   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      /* The fixed 16-byte stride assumes no compacted instructions. */
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset +
                           (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* With no block end, the ENDIF jumps to the next instruction. */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }
   }
}
2674
/**
 * Emit a SEND carrying an FF sync message.  The source payload is first
 * moved to the message register implied by msg_reg_nr (see
 * gen6_resolve_implied_move()).
 */
void brw_ff_sync(struct brw_codegen *p,
                 struct brw_reg dest,
                 unsigned msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-Gen6 SENDs take the payload from the base MRF implicitly. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}
2702
2703 /**
2704 * Emit the SEND instruction necessary to generate stream output data on Gen6
2705 * (for transform feedback).
2706 *
2707 * If send_commit_msg is true, this is the last piece of stream output data
2708 * from this thread, so send the data as a committed write. According to the
2709 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2710 *
2711 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2712 * writes are complete by sending the final write as a committed write."
2713 */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Pick the data port for the generation: data cache on Gen7+, render
    * cache on Gen6, legacy write port earlier.
    */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   /* mlen is 1; rlen equals send_commit_msg, since a committed write
    * returns one register to establish the write dependency.
    */
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, send_commit_msg, true) |
                brw_dp_write_desc(devinfo, binding_table_index,
                                  0, /* msg_control: ignored */
                                  GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                                  0, /* last_render_target: ignored */
                                  send_commit_msg)); /* send_commit_msg */
}
2743
/* Number of response/payload registers needed for num_channels channels of
 * data at the given execution size (0 selects the SIMD4x2 layout).
 */
static unsigned
brw_surface_payload_size(struct brw_codegen *p,
                         unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   /* SIMD4x2 messages pack everything into a single register. */
   if (exec_size == 0)
      return 1;

   /* One register per channel up to SIMD8, two per channel above that. */
   const unsigned regs_per_channel = exec_size <= 8 ? 1 : 2;
   return regs_per_channel * num_channels;
}
2756
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Untyped atomics live in data cache 1 on HSW+ and the (only) data
    * cache on IVB/BYT.
    */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
   /* In Align16 mode use SIMD4x2 (exec_size 0) where available, SIMD8
    * otherwise.
    */
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   /* response_expected doubles as the channel count (0 or 1) for sizing
    * the response.
    */
   const unsigned response_length =
      brw_surface_payload_size(p, response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}
2793
2794 void
2795 brw_untyped_surface_read(struct brw_codegen *p,
2796 struct brw_reg dst,
2797 struct brw_reg payload,
2798 struct brw_reg surface,
2799 unsigned msg_length,
2800 unsigned num_channels)
2801 {
2802 const struct gen_device_info *devinfo = p->devinfo;
2803 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2804 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2805 GEN7_SFID_DATAPORT_DATA_CACHE);
2806 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
2807 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
2808 const unsigned response_length =
2809 brw_surface_payload_size(p, num_channels, exec_size);
2810 const unsigned desc =
2811 brw_message_desc(devinfo, msg_length, response_length, false) |
2812 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
2813
2814 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
2815 }
2816
2817 void
2818 brw_untyped_surface_write(struct brw_codegen *p,
2819 struct brw_reg payload,
2820 struct brw_reg surface,
2821 unsigned msg_length,
2822 unsigned num_channels,
2823 bool header_present)
2824 {
2825 const struct gen_device_info *devinfo = p->devinfo;
2826 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2827 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2828 GEN7_SFID_DATAPORT_DATA_CACHE);
2829 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
2830 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
2831 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
2832 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
2833 has_simd4x2 ? 0 : 8;
2834 const unsigned desc =
2835 brw_message_desc(devinfo, msg_length, 0, header_present) |
2836 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
2837 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
2838 const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
2839
2840 brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
2841 payload, surface, desc);
2842 }
2843
2844 void
2845 brw_typed_atomic(struct brw_codegen *p,
2846 struct brw_reg dst,
2847 struct brw_reg payload,
2848 struct brw_reg surface,
2849 unsigned atomic_op,
2850 unsigned msg_length,
2851 bool response_expected,
2852 bool header_present) {
2853 const struct gen_device_info *devinfo = p->devinfo;
2854 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2855 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2856 GEN6_SFID_DATAPORT_RENDER_CACHE);
2857 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
2858 /* SIMD4x2 typed atomic instructions only exist on HSW+ */
2859 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
2860 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
2861 has_simd4x2 ? 0 : 8;
2862 /* Typed atomics don't support SIMD16 */
2863 assert(exec_size <= 8);
2864 const unsigned response_length =
2865 brw_surface_payload_size(p, response_expected, exec_size);
2866 const unsigned desc =
2867 brw_message_desc(devinfo, msg_length, response_length, header_present) |
2868 brw_dp_typed_atomic_desc(devinfo, exec_size, brw_get_default_group(p),
2869 atomic_op, response_expected);
2870 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
2871 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2872
2873 brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
2874 payload, surface, desc);
2875 }
2876
2877 void
2878 brw_typed_surface_read(struct brw_codegen *p,
2879 struct brw_reg dst,
2880 struct brw_reg payload,
2881 struct brw_reg surface,
2882 unsigned msg_length,
2883 unsigned num_channels,
2884 bool header_present)
2885 {
2886 const struct gen_device_info *devinfo = p->devinfo;
2887 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2888 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2889 GEN6_SFID_DATAPORT_RENDER_CACHE);
2890 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
2891 /* SIMD4x2 typed read instructions only exist on HSW+ */
2892 const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
2893 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
2894 has_simd4x2 ? 0 : 8;
2895 const unsigned response_length =
2896 brw_surface_payload_size(p, num_channels, exec_size);
2897 const unsigned desc =
2898 brw_message_desc(devinfo, msg_length, response_length, header_present) |
2899 brw_dp_typed_surface_rw_desc(devinfo, exec_size, brw_get_default_group(p),
2900 num_channels, false);
2901
2902 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
2903 }
2904
/**
 * Emit a typed surface write of num_channels components per channel from
 * the payload to the given surface.  No response is requested and unused
 * destination components are masked out (see brw_untyped_atomic()).
 */
void
brw_typed_surface_write(struct brw_codegen *p,
                        struct brw_reg payload,
                        struct brw_reg surface,
                        unsigned msg_length,
                        unsigned num_channels,
                        bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Typed messages use data cache 1 on HSW+ and the render cache before. */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN6_SFID_DATAPORT_RENDER_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 typed surface write instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   /* Writes request no response, hence response length 0. */
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, 0, header_present) |
      brw_dp_typed_surface_rw_desc(devinfo, exec_size, brw_get_default_group(p),
                                   num_channels, true);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
                                     payload, surface, desc);
}
2932
/**
 * Fill in the SFID and descriptor fields of an already-allocated SEND so
 * it becomes a memory fence message for the given dataport (render cache
 * or data cache).  When commit_enable is set, a one-register response is
 * requested along with the commit-enable bit of the message control field.
 */
static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Message length 1, header present; response only if a committed write
    * was requested.
    */
   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, (commit_enable ? 1 : 0), true));

   brw_inst_set_sfid(devinfo, insn, sfid);

   /* Each dataport has its own MEMORY_FENCE message type. */
   switch (sfid) {
   case GEN6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GEN7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

   /* Set the commit-enable bit (bit 5 of the message control field). */
   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
}
2960
/**
 * Emit a memory fence for the data cache.  On IVB a second fence targeting
 * the render cache is emitted as well, followed by a MOV that stalls until
 * both fences have completed.  dst is only written for dependency
 * tracking; the fence messages themselves return no useful data.
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 enum opcode send_op)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Gen10+ requires a commit write (HSD workaround); IVB needs one so the
    * cross-cache ordering MOV below has something to read.
    */
   const bool commit_enable =
      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
      (devinfo->gen == 7 && !devinfo->is_haswell);
   struct brw_inst *insn;

   /* The fence is a scalar unmasked operation regardless of the current
    * defaults.
    */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = vec1(dst);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, send_op);
   dst = retype(dst, BRW_REGISTER_TYPE_UW);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too. Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, send_op);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   brw_pop_insn_state(p);
}
3008
3009 void
3010 brw_pixel_interpolator_query(struct brw_codegen *p,
3011 struct brw_reg dest,
3012 struct brw_reg mrf,
3013 bool noperspective,
3014 unsigned mode,
3015 struct brw_reg data,
3016 unsigned msg_length,
3017 unsigned response_length)
3018 {
3019 const struct gen_device_info *devinfo = p->devinfo;
3020 const uint16_t exec_size = brw_get_default_exec_size(p);
3021 const unsigned slot_group = brw_get_default_group(p) / 16;
3022 const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3023 const unsigned desc =
3024 brw_message_desc(devinfo, msg_length, response_length, false) |
3025 brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
3026 slot_group);
3027
3028 /* brw_send_indirect_message will automatically use a direct send message
3029 * if data is actually immediate.
3030 */
3031 brw_send_indirect_message(p,
3032 GEN7_SFID_PIXEL_INTERPOLATOR,
3033 dest,
3034 mrf,
3035 vec1(data),
3036 desc);
3037 }
3038
/**
 * Emit a sequence that computes the index of the first enabled channel
 * into (component X of) dst, relative to the current quarter control.
 * mask must be of UD type and provides the dispatch (or vector) mask;
 * passing an immediate ~0u tells the Gen8+ path it may rely on ce0 alone
 * (see the ce0 comments below for when that is valid).
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   /* The flag register is only used on Gen7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask. The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n). Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         /* Clear the flag register before accumulating the execution mask
          * into it.
          */
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0. We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register. Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3160
/**
 * Copy the value of component idx of src into dst.  idx may be an
 * immediate or a register; src must be a direct GRF of the same type as
 * dst with no source modifiers (see the asserts below).
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                       retype(brw_vec1_indirect(addr.subnr, offset),
                              BRW_REGISTER_TYPE_D));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                       retype(brw_vec1_indirect(addr.subnr, offset + 4),
                              BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3280
3281 /**
3282 * This instruction is generated as a single-channel align1 instruction by
3283 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3284 *
3285 * We can't use the typed atomic op in the FS because that has the execution
3286 * mask ANDed with the pixel mask, but we just want to write the one dword for
3287 * all the pixels.
3288 *
3289 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3290 * one u32. So we use the same untyped atomic write message as the pixel
3291 * shader.
3292 *
3293 * The untyped atomic operation requires a BUFFER surface type with RAW
3294 * format, and is only accessible through the legacy DATA_CACHE dataport
3295 * messages.
3296 */
3297 void brw_shader_time_add(struct brw_codegen *p,
3298 struct brw_reg payload,
3299 uint32_t surf_index)
3300 {
3301 const struct gen_device_info *devinfo = p->devinfo;
3302 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3303 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3304 GEN7_SFID_DATAPORT_DATA_CACHE);
3305 assert(devinfo->gen >= 7);
3306
3307 brw_push_insn_state(p);
3308 brw_set_default_access_mode(p, BRW_ALIGN_1);
3309 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3310 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3311 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3312
3313 /* We use brw_vec1_reg and unmasked because we want to increment the given
3314 * offset only once.
3315 */
3316 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3317 BRW_ARF_NULL, 0));
3318 brw_set_src0(p, send, brw_vec1_reg(payload.file,
3319 payload.nr, 0));
3320 brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
3321 brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
3322 false)));
3323
3324 brw_inst_set_sfid(devinfo, send, sfid);
3325 brw_inst_set_binding_table_index(devinfo, send, surf_index);
3326
3327 brw_pop_insn_state(p);
3328 }
3329
3330
3331 /**
3332 * Emit the SEND message for a barrier
3333 */
3334 void
3335 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3336 {
3337 const struct gen_device_info *devinfo = p->devinfo;
3338 struct brw_inst *inst;
3339
3340 assert(devinfo->gen >= 7);
3341
3342 brw_push_insn_state(p);
3343 brw_set_default_access_mode(p, BRW_ALIGN_1);
3344 inst = next_insn(p, BRW_OPCODE_SEND);
3345 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3346 brw_set_src0(p, inst, src);
3347 brw_set_src1(p, inst, brw_null_reg());
3348 brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
3349
3350 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
3351 brw_inst_set_gateway_notify(devinfo, inst, 1);
3352 brw_inst_set_gateway_subfuncid(devinfo, inst,
3353 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3354
3355 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3356 brw_pop_insn_state(p);
3357 }
3358
3359
3360 /**
3361 * Emit the wait instruction for a barrier
3362 */
3363 void
3364 brw_WAIT(struct brw_codegen *p)
3365 {
3366 const struct gen_device_info *devinfo = p->devinfo;
3367 struct brw_inst *insn;
3368
3369 struct brw_reg src = brw_notification_reg();
3370
3371 insn = next_insn(p, BRW_OPCODE_WAIT);
3372 brw_set_dest(p, insn, src);
3373 brw_set_src0(p, insn, src);
3374 brw_set_src1(p, insn, brw_null_reg());
3375
3376 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3377 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3378 }
3379
3380 /**
3381 * Changes the floating point rounding mode updating the control register
3382 * field defined at cr0.0[5-6] bits. This function supports the changes to
3383 * RTNE (00), RU (01), RD (10) and RTZ (11) rounding using bitwise operations.
3384 * Only RTNE and RTZ rounding are enabled at nir.
3385 */
3386 void
3387 brw_rounding_mode(struct brw_codegen *p,
3388 enum brw_rnd_mode mode)
3389 {
3390 const unsigned bits = mode << BRW_CR0_RND_MODE_SHIFT;
3391
3392 if (bits != BRW_CR0_RND_MODE_MASK) {
3393 brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3394 brw_imm_ud(~BRW_CR0_RND_MODE_MASK));
3395 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3396
3397 /* From the Skylake PRM, Volume 7, page 760:
3398 * "Implementation Restriction on Register Access: When the control
3399 * register is used as an explicit source and/or destination, hardware
3400 * does not ensure execution pipeline coherency. Software must set the
3401 * thread control field to ‘switch’ for an instruction that uses
3402 * control register as an explicit operand."
3403 */
3404 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3405 }
3406
3407 if (bits) {
3408 brw_inst *inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3409 brw_imm_ud(bits));
3410 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3411 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3412 }
3413 }