i965: Extract functions dealing with register types to separate file
[mesa.git] / src / intel / compiler / brw_eu_emit.c
/*
 Copyright (C) Intel Corp. 2006. All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */


#include "brw_eu_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case. This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
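
/* A minimal usage sketch (the register and MRF number are hypothetical, not
 * from this file): resolve the implied move first, then emit the SEND with
 * the rewritten source.
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &payload, 1);
 *    // payload now refers to m1 (or is left untouched pre-gen6, where the
 *    // hardware does the move implicitly), so it is safe as the SEND source.
 */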

static void
gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}
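
/* Worked example (assuming GEN7_MRF_HACK_START is 112, as the PRM quote
 * above suggests): on gen7+ a fake MRF such as m4 is rewritten to g116,
 * which lands in the R112-R127 range required for messages with EOT.
 */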

void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
   brw_inst_set_dst_reg_type(devinfo, inst,
                             brw_reg_type_to_hw_type(devinfo, dest.file,
                                                     dest.type));
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
         /* Even though it's ignored in DA16, it still needs to be set to '01'. */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct. However, when dealing with
    * small registers, we automatically reduce it to match the register size.
    *
    * In platforms that support fp64 we can emit instructions with a width of
    * 4 that need two SIMD8 registers and an exec_size of 8 or 16. In these
    * cases we need to make sure that these instructions have their exec sizes
    * set properly when they are emitted and we can't rely on this code to fix
    * it.
    */
   bool fix_exec_size;
   if (devinfo->gen >= 6)
      fix_exec_size = dest.width < BRW_EXECUTE_4;
   else
      fix_exec_size = dest.width < BRW_EXECUTE_8;

   if (fix_exec_size)
      brw_inst_set_exec_size(devinfo, inst, dest.width);
}
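
/* Illustration of the exec-size fixup above (a sketch, not a call from this
 * file): emitting a MOV whose destination is a single scalar, e.g.
 * brw_MOV(p, brw_vec1_grf(10, 0), brw_imm_f(1.0f)), leaves dest.width at
 * BRW_EXECUTE_1, so the instruction's exec size is reduced to 1 here even
 * though the generator's default was 8 or 16.
 */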

static void
validate_reg(const struct gen_device_info *devinfo,
             brw_inst *inst, struct brw_reg reg)
{
   const int hstride_for_reg[] = {0, 1, 2, 4};
   const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
   const int width_for_reg[] = {1, 2, 4, 8, 16};
   const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE)
      return;

   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_NULL)
      return;

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Swizzling is not allowed when an accumulator is used as an implicit
    *    source or an explicit source in an instruction."
    */
   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_ACCUMULATOR)
      assert(reg.swizzle == BRW_SWIZZLE_XYZW);

   assert(reg.hstride < ARRAY_SIZE(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
   width = width_for_reg[reg.width];

   assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
          brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
   execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];

   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
}
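
/* Worked example for restriction 4 above: a <4;4,1>:f region with exec size
 * 4 has execsize == width and hstride != 0, so vstride must equal
 * width * hstride = 4 * 1, which it does; a <8;4,1>:f region at exec size 4
 * would trip the assert, since its vstride of 8 does not match.
 */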

void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src0_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.file, reg.type));
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      if (reg.type == BRW_REGISTER_TYPE_DF ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
         brw_inst_set_imm_df(devinfo, inst, reg.df);
      else if (reg.type == BRW_REGISTER_TYPE_UQ ||
               reg.type == BRW_REGISTER_TYPE_Q)
         brw_inst_set_imm_uq(devinfo, inst, reg.u64);
      else
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);

      if (type_sz(reg.type) < 8) {
         brw_inst_set_src1_reg_file(devinfo, inst,
                                    BRW_ARCHITECTURE_REGISTER_FILE);
         brw_inst_set_src1_reg_type(devinfo, inst,
                                    brw_inst_src0_reg_type(devinfo, inst));
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
         } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
            /* This is an oddity of the fact we're using the same
             * descriptions for registers in align_16 as align_1:
             */
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    reg.type == BRW_REGISTER_TYPE_DF &&
                    reg.vstride == BRW_VERTICAL_STRIDE_2) {
            /* From SNB PRM:
             *
             * "For Align16 access mode, only encodings of 0000 and 0011
             *  are allowed. Other codes are reserved."
             *
             * Presumably the DevSNB behavior applies to IVB as well.
             */
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else {
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      }
   }
}
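
/* Note on the Align1 special case above: a width-1 source in an exec-size-1
 * instruction is always encoded as <0;1,0>, the canonical scalar region,
 * regardless of the strides the caller supplied.
 */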


void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *    operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src1_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.file, reg.type));
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* two-argument instructions can only use 32-bit immediates */
      assert(type_sz(reg.type) < 8);
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
            /* This is an oddity of the fact we're using the same
             * descriptions for registers in align_16 as align_1:
             */
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    reg.type == BRW_REGISTER_TYPE_DF &&
                    reg.vstride == BRW_VERTICAL_STRIDE_2) {
            /* From SNB PRM:
             *
             * "For Align16 access mode, only encodings of 0000 and 0011
             *  are allowed. Other codes are reserved."
             *
             * Presumably the DevSNB behavior applies to IVB as well.
             */
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else {
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      }
   }
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data. Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
void
brw_set_message_descriptor(struct brw_codegen *p,
                           brw_inst *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_src1(p, inst, brw_imm_d(0));

   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
    * itself; instead, it will be a MOV/OR into the address register.
    *
    * In this case, we avoid setting the extended message descriptor bits,
    * since they go on the later SEND/SENDC instead and if set here would
    * instead clobber the conditionalmod bits.
    */
   unsigned opcode = brw_inst_opcode(devinfo, inst);
   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
      brw_inst_set_sfid(devinfo, inst, sfid);
   }

   brw_inst_set_mlen(devinfo, inst, msg_length);
   brw_inst_set_rlen(devinfo, inst, response_length);
   brw_inst_set_eot(devinfo, inst, end_of_thread);

   if (devinfo->gen >= 5) {
      brw_inst_set_header_present(devinfo, inst, header_present);
   }
}
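
/* Typical call sequence (a sketch; the helpers are the ones defined in this
 * file, but the argument values are illustrative):
 *
 *    brw_inst *send = next_insn(p, BRW_OPCODE_SEND);
 *    brw_set_dest(p, send, dest);
 *    brw_set_src0(p, send, payload);
 *    brw_set_message_descriptor(p, send, sfid, mlen, rlen, true, false);
 *    // ...only now fill in the message-specific Function Control bits,
 *    // since the call above zeroes them.
 */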

static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }


   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}


static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}

static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}

void
brw_set_dp_write_message(struct brw_codegen *p,
                         brw_inst *insn,
                         unsigned binding_table_index,
                         unsigned msg_control,
                         unsigned msg_type,
                         unsigned target_cache,
                         unsigned msg_length,
                         bool header_present,
                         unsigned last_render_target,
                         unsigned response_length,
                         unsigned end_of_thread,
                         unsigned send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
                          BRW_SFID_DATAPORT_WRITE);

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
   brw_inst_set_rt_last(devinfo, insn, last_render_target);
   if (devinfo->gen < 7) {
      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
   }
}

void
brw_set_dp_read_message(struct brw_codegen *p,
                        brw_inst *insn,
                        unsigned binding_table_index,
                        unsigned msg_control,
                        unsigned msg_type,
                        unsigned target_cache,
                        unsigned msg_length,
                        bool header_present,
                        unsigned response_length)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
                          BRW_SFID_DATAPORT_READ);

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
   if (devinfo->gen < 6)
      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
}

void
brw_set_sampler_message(struct brw_codegen *p,
                        brw_inst *inst,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
   brw_inst_set_sampler(devinfo, inst, sampler);
   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
   if (devinfo->gen >= 5) {
      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
   }
}

static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
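
/* Worked example of the block size encoding above: for num_regs == 4, gen7
 * encodes num_regs - 1 == 3, while gen8+ encodes _mesa_logbase2(4) == 2.
 */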

#define next_insn brw_next_insn
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   brw_inst_set_opcode(devinfo, insn, opcode);
   return insn;
}

static brw_inst *
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static brw_inst *
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
    * use 32-bit units (components 0..7). Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}
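
/* Example: a source at byte subregister 8 (the third dword of a register)
 * is encoded as component 8 / 4 == 2.
 */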

static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_DF ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types. The MAD and LRP emitters ensure
       * that all four types are float. The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_DF:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      default:
         unreachable("not reached");
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}

#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0,                     \
                   struct brw_reg src1)                     \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}

#define ALU3(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

#define ALU3F(OP)                                                 \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                     \
          dest.type == BRW_REGISTER_TYPE_DF);                     \
   if (dest.type == BRW_REGISTER_TYPE_F) {                        \
      assert(src0.type == BRW_REGISTER_TYPE_F);                   \
      assert(src1.type == BRW_REGISTER_TYPE_F);                   \
      assert(src2.type == BRW_REGISTER_TYPE_F);                   \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                  \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register. A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                    \
void brw_##OP(struct brw_codegen *p,                                 \
              struct brw_reg dest,                                   \
              struct brw_reg src)                                    \
{                                                                    \
   const struct gen_device_info *devinfo = p->devinfo;               \
   brw_inst *rnd, *add;                                              \
   rnd = next_insn(p, BRW_OPCODE_##OP);                              \
   brw_set_dest(p, rnd, dest);                                       \
   brw_set_src0(p, rnd, src);                                        \
                                                                     \
   if (devinfo->gen < 6) {                                           \
      /* turn on round-increments */                                 \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);   \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                 \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
   }                                                                 \
}
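
/* Sketch of what ROUND expands to pre-gen6 (illustrative pseudo-assembly,
 * not actual disassembler output): for RNDZ the emitted sequence is roughly
 *
 *    rndz.r  dest, src          // sets the per-channel increment bit
 *    (+f0)   add dest, dest, 1.0
 *
 * while on gen6+ only the RNDZ itself is emitted.
 */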


ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)

brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use a <1,2,0> source region to read
    * each element twice.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell &&
       brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      assert(src0.vstride == BRW_VERTICAL_STRIDE_4 &&
             src0.width == BRW_WIDTH_4 &&
             src0.hstride == BRW_HORIZONTAL_STRIDE_1);

      src0.vstride = BRW_VERTICAL_STRIDE_1;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}

brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
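
/* The zero-fill path above works by retyping the UD destination as W with a
 * stride of 2, writing the converted halves into the low words, and then
 * MOVing 0 into the odd (high) words, with the dependency-control flags set
 * so the two overlapping writes are safe.
 */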

brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *    Because this instruction does not have a 16-bit floating-point
       *    type, the source data type must be Word (W). The destination type
       *    must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}


void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}




/***********************************************************************
 * Comparisons, if/else/endif
 */

brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}
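
/* Note: JMPI is encoded here as "add ip, ip, index", so the jump target is
 * expressed relative to the instruction pointer rather than as an absolute
 * offset.
 */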

static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack). Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off. If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier). It is not used on gen7.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}

void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}

void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of the instruction
    * store memory (p->store), so call it first, before referencing the
    * instruction store pointer from an index.
    */
1527 if (emit_endif)
1528 insn = next_insn(p, BRW_OPCODE_ENDIF);
1529
1530 /* Pop the IF and (optional) ELSE instructions from the stack */
1531 p->if_depth_in_loop[p->loop_stack_depth]--;
1532 tmp = pop_if_stack(p);
1533 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1534 else_inst = tmp;
1535 tmp = pop_if_stack(p);
1536 }
1537 if_inst = tmp;
1538
1539 if (!emit_endif) {
1540 /* ENDIF is useless; don't bother emitting it. */
1541 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1542 return;
1543 }
1544
1545 if (devinfo->gen < 6) {
1546 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1547 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1548 brw_set_src1(p, insn, brw_imm_d(0x0));
1549 } else if (devinfo->gen == 6) {
1550 brw_set_dest(p, insn, brw_imm_w(0));
1551 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1552 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1553 } else if (devinfo->gen == 7) {
1554 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1555 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1556 brw_set_src1(p, insn, brw_imm_w(0));
1557 } else {
1558 brw_set_src0(p, insn, brw_imm_d(0));
1559 }
1560
1561 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1562 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1563 if (devinfo->gen < 6)
1564 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1565
1566 /* Also pop item off the stack in the endif instruction: */
1567 if (devinfo->gen < 6) {
1568 brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1569 brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1570 } else if (devinfo->gen == 6) {
1571 brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1572 } else {
1573 brw_inst_set_jip(devinfo, insn, 2);
1574 }
1575 patch_IF_ELSE(p, if_inst, else_inst, insn);
1576 }
1577
1578 brw_inst *
1579 brw_BREAK(struct brw_codegen *p)
1580 {
1581 const struct gen_device_info *devinfo = p->devinfo;
1582 brw_inst *insn;
1583
1584 insn = next_insn(p, BRW_OPCODE_BREAK);
1585 if (devinfo->gen >= 8) {
1586 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1587 brw_set_src0(p, insn, brw_imm_d(0x0));
1588 } else if (devinfo->gen >= 6) {
1589 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1590 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1591 brw_set_src1(p, insn, brw_imm_d(0x0));
1592 } else {
1593 brw_set_dest(p, insn, brw_ip_reg());
1594 brw_set_src0(p, insn, brw_ip_reg());
1595 brw_set_src1(p, insn, brw_imm_d(0x0));
1596 brw_inst_set_gen4_pop_count(devinfo, insn,
1597 p->if_depth_in_loop[p->loop_stack_depth]);
1598 }
1599 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1600 brw_inst_set_exec_size(devinfo, insn,
1601 brw_inst_exec_size(devinfo, p->current));
1602
1603 return insn;
1604 }
1605
1606 brw_inst *
1607 brw_CONT(struct brw_codegen *p)
1608 {
1609 const struct gen_device_info *devinfo = p->devinfo;
1610 brw_inst *insn;
1611
1612 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1613 brw_set_dest(p, insn, brw_ip_reg());
1614 if (devinfo->gen >= 8) {
1615 brw_set_src0(p, insn, brw_imm_d(0x0));
1616 } else {
1617 brw_set_src0(p, insn, brw_ip_reg());
1618 brw_set_src1(p, insn, brw_imm_d(0x0));
1619 }
1620
1621 if (devinfo->gen < 6) {
1622 brw_inst_set_gen4_pop_count(devinfo, insn,
1623 p->if_depth_in_loop[p->loop_stack_depth]);
1624 }
1625 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1626 brw_inst_set_exec_size(devinfo, insn,
1627 brw_inst_exec_size(devinfo, p->current));
1628 return insn;
1629 }
1630
1631 brw_inst *
1632 gen6_HALT(struct brw_codegen *p)
1633 {
1634 const struct gen_device_info *devinfo = p->devinfo;
1635 brw_inst *insn;
1636
1637 insn = next_insn(p, BRW_OPCODE_HALT);
1638 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1639 if (devinfo->gen >= 8) {
1640 brw_set_src0(p, insn, brw_imm_d(0x0));
1641 } else {
1642 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1643 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1644 }
1645
1646 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1647 brw_inst_set_exec_size(devinfo, insn,
1648 brw_inst_exec_size(devinfo, p->current));
1649 return insn;
1650 }
1651
1652 /* DO/WHILE loop:
1653 *
1654  * The DO/WHILE is just an unterminated loop -- break or continue are
1655  * used for control within the loop.  There are a few ways it can be
1656  * implemented.
1657 *
1658 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1659 * jip and no DO instruction.
1660 *
1661 * For non-uniform control flow pre-gen6, there's a DO instruction to
1662 * push the mask, and a WHILE to jump back, and BREAK to get out and
1663 * pop the mask.
1664 *
1665 * For gen6, there's no more mask stack, so no need for DO. WHILE
1666 * just points back to the first instruction of the loop.
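 *
 * Roughly, the three shapes are (a sketch, not literal assembly):
 *
 *   uniform (SPF on):       ...body...                     ADD ip, ip, jip
 *   pre-gen6 non-uniform:   DO ...body with BREAK/CONT...  WHILE
 *   gen6+:                  ...body with BREAK/CONT...     WHILE (JIP to top)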
1667 */
1668 brw_inst *
1669 brw_DO(struct brw_codegen *p, unsigned execute_size)
1670 {
1671 const struct gen_device_info *devinfo = p->devinfo;
1672
1673 if (devinfo->gen >= 6 || p->single_program_flow) {
1674 push_loop_stack(p, &p->store[p->nr_insn]);
1675 return &p->store[p->nr_insn];
1676 } else {
1677 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1678
1679 push_loop_stack(p, insn);
1680
1681 /* Override the defaults for this instruction:
1682 */
1683 brw_set_dest(p, insn, brw_null_reg());
1684 brw_set_src0(p, insn, brw_null_reg());
1685 brw_set_src1(p, insn, brw_null_reg());
1686
1687 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1688 brw_inst_set_exec_size(devinfo, insn, execute_size);
1689 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1690
1691 return insn;
1692 }
1693 }
1694
1695 /**
1696 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1697 * instruction here.
1698 *
1699 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1700 * nesting, since it can always just point to the end of the block/current loop.
1701 */
1702 static void
1703 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1704 {
1705 const struct gen_device_info *devinfo = p->devinfo;
1706 brw_inst *do_inst = get_inner_do_insn(p);
1707 brw_inst *inst;
1708 unsigned br = brw_jump_scale(devinfo);
1709
1710 assert(devinfo->gen < 6);
1711
1712 for (inst = while_inst - 1; inst != do_inst; inst--) {
1713       /* If the jump count is nonzero, this instruction has already been
1714        * patched because it's part of a loop inside the one we're
1715 * patching.
1716 */
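      /* A worked example of the arithmetic below (illustrative): for a BREAK
       * three instructions before the WHILE, while_inst - inst == 3, so gen4
       * (br == 1) stores a jump count of 4 and gen5 (br == 2) stores 8 --
       * either way, one instruction past the WHILE.
       */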
1717 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1718 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1719 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1720 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1721 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1722 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1723 }
1724 }
1725 }
1726
1727 brw_inst *
1728 brw_WHILE(struct brw_codegen *p)
1729 {
1730 const struct gen_device_info *devinfo = p->devinfo;
1731 brw_inst *insn, *do_insn;
1732 unsigned br = brw_jump_scale(devinfo);
1733
1734 if (devinfo->gen >= 6) {
1735 insn = next_insn(p, BRW_OPCODE_WHILE);
1736 do_insn = get_inner_do_insn(p);
1737
1738 if (devinfo->gen >= 8) {
1739 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1740 brw_set_src0(p, insn, brw_imm_d(0));
1741 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1742 } else if (devinfo->gen == 7) {
1743 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1744 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1745 brw_set_src1(p, insn, brw_imm_w(0));
1746 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1747 } else {
1748 brw_set_dest(p, insn, brw_imm_w(0));
1749 brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1750 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1751 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1752 }
1753
1754 brw_inst_set_exec_size(devinfo, insn,
1755 brw_inst_exec_size(devinfo, p->current));
1756
1757 } else {
1758 if (p->single_program_flow) {
1759 insn = next_insn(p, BRW_OPCODE_ADD);
1760 do_insn = get_inner_do_insn(p);
1761
1762 brw_set_dest(p, insn, brw_ip_reg());
1763 brw_set_src0(p, insn, brw_ip_reg());
1764 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1765 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1766 } else {
1767 insn = next_insn(p, BRW_OPCODE_WHILE);
1768 do_insn = get_inner_do_insn(p);
1769
1770 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1771
1772 brw_set_dest(p, insn, brw_ip_reg());
1773 brw_set_src0(p, insn, brw_ip_reg());
1774 brw_set_src1(p, insn, brw_imm_d(0));
1775
1776 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1777 brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1778 brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1779
1780 brw_patch_break_cont(p, insn);
1781 }
1782 }
1783 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1784
1785 p->loop_stack_depth--;
1786
1787 return insn;
1788 }
1789
1790 /* FORWARD JUMPS:
1791 */
1792 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1793 {
1794 const struct gen_device_info *devinfo = p->devinfo;
1795 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1796 unsigned jmpi = 1;
1797
1798 if (devinfo->gen >= 5)
1799 jmpi = 2;
1800
1801 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1802 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1803
1804 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1805 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1806 }
1807
1808 /* To integrate with the above, it makes sense that the comparison
1809 * instruction should populate the flag register. It might be simpler
1810 * just to use the flag reg for most WM tasks?
1811 */
1812 void brw_CMP(struct brw_codegen *p,
1813 struct brw_reg dest,
1814 unsigned conditional,
1815 struct brw_reg src0,
1816 struct brw_reg src1)
1817 {
1818 const struct gen_device_info *devinfo = p->devinfo;
1819 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1820
1821 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1822 brw_set_dest(p, insn, dest);
1823 brw_set_src0(p, insn, src0);
1824 brw_set_src1(p, insn, src1);
1825
1826 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1827 * page says:
1828 * "Any CMP instruction with a null destination must use a {switch}."
1829 *
1830 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1831 * mentioned on their work-arounds pages.
1832 */
1833 if (devinfo->gen == 7) {
1834 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1835 dest.nr == BRW_ARF_NULL) {
1836 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1837 }
1838 }
1839 }
1840
1841 /***********************************************************************
1842 * Helpers for the various SEND message types:
1843 */
1844
1845 /** Extended math function, float[8].
1846 */
1847 void gen4_math(struct brw_codegen *p,
1848 struct brw_reg dest,
1849 unsigned function,
1850 unsigned msg_reg_nr,
1851 struct brw_reg src,
1852 unsigned precision )
1853 {
1854 const struct gen_device_info *devinfo = p->devinfo;
1855 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1856 unsigned data_type;
1857 if (has_scalar_region(src)) {
1858 data_type = BRW_MATH_DATA_SCALAR;
1859 } else {
1860 data_type = BRW_MATH_DATA_VECTOR;
1861 }
1862
1863 assert(devinfo->gen < 6);
1864
1865 /* Example code doesn't set predicate_control for send
1866 * instructions.
1867 */
1868 brw_inst_set_pred_control(devinfo, insn, 0);
1869 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1870
1871 brw_set_dest(p, insn, dest);
1872 brw_set_src0(p, insn, src);
1873 brw_set_math_message(p,
1874 insn,
1875 function,
1876 src.type == BRW_REGISTER_TYPE_D,
1877 precision,
1878 data_type);
1879 }
1880
1881 void gen6_math(struct brw_codegen *p,
1882 struct brw_reg dest,
1883 unsigned function,
1884 struct brw_reg src0,
1885 struct brw_reg src1)
1886 {
1887 const struct gen_device_info *devinfo = p->devinfo;
1888 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1889
1890 assert(devinfo->gen >= 6);
1891
1892 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1893 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1894
1895 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1896 if (devinfo->gen == 6) {
1897 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1898 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1899 }
1900
1901 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1902 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1903 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1904 assert(src0.type != BRW_REGISTER_TYPE_F);
1905 assert(src1.type != BRW_REGISTER_TYPE_F);
1906 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
1907 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
1908 } else {
1909 assert(src0.type == BRW_REGISTER_TYPE_F);
1910 assert(src1.type == BRW_REGISTER_TYPE_F);
1911 }
1912
1913 /* Source modifiers are ignored for extended math instructions on Gen6. */
1914 if (devinfo->gen == 6) {
1915 assert(!src0.negate);
1916 assert(!src0.abs);
1917 assert(!src1.negate);
1918 assert(!src1.abs);
1919 }
1920
1921 brw_inst_set_math_function(devinfo, insn, function);
1922
1923 brw_set_dest(p, insn, dest);
1924 brw_set_src0(p, insn, src0);
1925 brw_set_src1(p, insn, src1);
1926 }
1927
1928 /**
1929 * Return the right surface index to access the thread scratch space using
1930 * stateless dataport messages.
1931 */
1932 unsigned
1933 brw_scratch_surface_idx(const struct brw_codegen *p)
1934 {
1935 /* The scratch space is thread-local so IA coherency is unnecessary. */
1936 if (p->devinfo->gen >= 8)
1937 return GEN8_BTI_STATELESS_NON_COHERENT;
1938 else
1939 return BRW_BTI_STATELESS;
1940 }
1941
1942 /**
1943  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1944 * using a constant offset per channel.
1945 *
1946 * The offset must be aligned to oword size (16 bytes). Used for
1947 * register spilling.
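 *
 * A hypothetical call spilling two GRFs to byte offset 64 of the scratch
 * buffer (mlen becomes 1 + 2 for the header plus data):
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, 64);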
1948 */
1949 void brw_oword_block_write_scratch(struct brw_codegen *p,
1950 struct brw_reg mrf,
1951 int num_regs,
1952 unsigned offset)
1953 {
1954 const struct gen_device_info *devinfo = p->devinfo;
1955 const unsigned target_cache =
1956 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
1957 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
1958 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
1959 uint32_t msg_type;
1960
1961 if (devinfo->gen >= 6)
1962 offset /= 16;
1963
1964 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1965
1966 const unsigned mlen = 1 + num_regs;
1967
1968 /* Set up the message header. This is g0, with g0.2 filled with
1969 * the offset. We don't want to leave our offset around in g0 or
1970 * it'll screw up texture samples, so set it up inside the message
1971 * reg.
1972 */
1973 {
1974 brw_push_insn_state(p);
1975 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1976 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1977 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1978
1979 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1980
1981 /* set message header global offset field (reg 0, element 2) */
1982 brw_MOV(p,
1983 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1984 mrf.nr,
1985 2), BRW_REGISTER_TYPE_UD),
1986 brw_imm_ud(offset));
1987
1988 brw_pop_insn_state(p);
1989 }
1990
1991 {
1992 struct brw_reg dest;
1993 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1994 int send_commit_msg;
1995 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1996 BRW_REGISTER_TYPE_UW);
1997
1998 brw_inst_set_compression(devinfo, insn, false);
1999
2000 if (brw_inst_exec_size(devinfo, insn) >= 16)
2001 src_header = vec16(src_header);
2002
2003 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2004 if (devinfo->gen < 6)
2005 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2006
2007 /* Until gen6, writes followed by reads from the same location
2008 * are not guaranteed to be ordered unless write_commit is set.
2009 * If set, then a no-op write is issued to the destination
2010 * register to set a dependency, and a read from the destination
2011 * can be used to ensure the ordering.
2012 *
2013 * For gen6, only writes between different threads need ordering
2014 * protection. Our use of DP writes is all about register
2015 * spilling within a thread.
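    *
    * Concretely (a summary of the branch below): pre-gen6 we request a
    * write commit and read it back into the header register, while gen6+
    * can simply use a null response register.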
2016 */
2017 if (devinfo->gen >= 6) {
2018 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2019 send_commit_msg = 0;
2020 } else {
2021 dest = src_header;
2022 send_commit_msg = 1;
2023 }
2024
2025 brw_set_dest(p, insn, dest);
2026 if (devinfo->gen >= 6) {
2027 brw_set_src0(p, insn, mrf);
2028 } else {
2029 brw_set_src0(p, insn, brw_null_reg());
2030 }
2031
2032 if (devinfo->gen >= 6)
2033 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2034 else
2035 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2036
2037 brw_set_dp_write_message(p,
2038 insn,
2039 brw_scratch_surface_idx(p),
2040 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2041 msg_type,
2042 target_cache,
2043 mlen,
2044 true, /* header_present */
2045 0, /* not a render target */
2046 send_commit_msg, /* response_length */
2047 0, /* eot */
2048 send_commit_msg);
2049 }
2050 }
2051
2052
2053 /**
2054 * Read a block of owords (half a GRF each) from the scratch buffer
2055 * using a constant index per channel.
2056 *
2057 * Offset must be aligned to oword size (16 bytes). Used for register
2058 * spilling.
2059 */
2060 void
2061 brw_oword_block_read_scratch(struct brw_codegen *p,
2062 struct brw_reg dest,
2063 struct brw_reg mrf,
2064 int num_regs,
2065 unsigned offset)
2066 {
2067 const struct gen_device_info *devinfo = p->devinfo;
2068
2069 if (devinfo->gen >= 6)
2070 offset /= 16;
2071
2072 if (p->devinfo->gen >= 7) {
2073 /* On gen 7 and above, we no longer have message registers and we can
2074 * send from any register we want. By using the destination register
2075 * for the message, we guarantee that the implied message write won't
2076 * accidentally overwrite anything. This has been a problem because
2077 * the MRF registers and source for the final FB write are both fixed
2078 * and may overlap.
2079 */
2080 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2081 } else {
2082 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2083 }
2084 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2085
2086 const unsigned rlen = num_regs;
2087 const unsigned target_cache =
2088 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2089 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2090 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2091
2092 {
2093 brw_push_insn_state(p);
2094 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2095 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2096 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2097
2098 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2099
2100 /* set message header global offset field (reg 0, element 2) */
2101 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2102
2103 brw_pop_insn_state(p);
2104 }
2105
2106 {
2107 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2108
2109 assert(brw_inst_pred_control(devinfo, insn) == 0);
2110 brw_inst_set_compression(devinfo, insn, false);
2111
2112 brw_set_dest(p, insn, dest); /* UW? */
2113 if (devinfo->gen >= 6) {
2114 brw_set_src0(p, insn, mrf);
2115 } else {
2116 brw_set_src0(p, insn, brw_null_reg());
2117 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2118 }
2119
2120 brw_set_dp_read_message(p,
2121 insn,
2122 brw_scratch_surface_idx(p),
2123 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2124 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2125 target_cache,
2126 1, /* msg_length */
2127 true, /* header_present */
2128 rlen);
2129 }
2130 }
2131
2132 void
2133 gen7_block_read_scratch(struct brw_codegen *p,
2134 struct brw_reg dest,
2135 int num_regs,
2136 unsigned offset)
2137 {
2138 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2139 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2140
2141 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2142
2143 /* The HW requires that the header is present; this is to get the g0.5
2144 * scratch offset.
2145 */
2146 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2147
2148 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2149 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2150 * is 32 bytes, which happens to be the size of a register.
2151 */
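   /* Illustrative arithmetic: a byte offset of 4096 becomes HWord offset
    * 4096 / 32 == 128, comfortably inside the 12-bit field.
    */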
2152 offset /= REG_SIZE;
2153 assert(offset < (1 << 12));
2154
2155 gen7_set_dp_scratch_message(p, insn,
2156 false, /* scratch read */
2157 false, /* OWords */
2158 false, /* invalidate after read */
2159 num_regs,
2160 offset,
2161 1, /* mlen: just g0 */
2162 num_regs, /* rlen */
2163 true); /* header present */
2164 }
2165
2166 /**
2167 * Read float[4] vectors from the data port constant cache.
2168 * Location (in buffer) should be a multiple of 16.
2169 * Used for fetching shader constants.
2170 */
2171 void brw_oword_block_read(struct brw_codegen *p,
2172 struct brw_reg dest,
2173 struct brw_reg mrf,
2174 uint32_t offset,
2175 uint32_t bind_table_index)
2176 {
2177 const struct gen_device_info *devinfo = p->devinfo;
2178 const unsigned target_cache =
2179 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2180 BRW_DATAPORT_READ_TARGET_DATA_CACHE);
2181 const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
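   /* Illustrative sizing: SIMD8 fetches 8 dwords (2 owords, one GRF of
    * response) and SIMD16 fetches 16 dwords (4 owords, two GRFs), matching
    * the message control and response_length computed below.
    */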
2182
2183 /* On newer hardware, offset is in units of owords. */
2184 if (devinfo->gen >= 6)
2185 offset /= 16;
2186
2187 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2188
2189 brw_push_insn_state(p);
2190 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2191 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2192 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2193
2194 brw_push_insn_state(p);
2195 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2196 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2197
2198 /* set message header global offset field (reg 0, element 2) */
2199 brw_MOV(p,
2200 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2201 mrf.nr,
2202 2), BRW_REGISTER_TYPE_UD),
2203 brw_imm_ud(offset));
2204 brw_pop_insn_state(p);
2205
2206 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2207
2208 /* cast dest to a uword[8] vector */
2209 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2210
2211 brw_set_dest(p, insn, dest);
2212 if (devinfo->gen >= 6) {
2213 brw_set_src0(p, insn, mrf);
2214 } else {
2215 brw_set_src0(p, insn, brw_null_reg());
2216 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2217 }
2218
2219 brw_set_dp_read_message(p, insn, bind_table_index,
2220 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2221 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2222 target_cache,
2223 1, /* msg_length */
2224 true, /* header_present */
2225 DIV_ROUND_UP(exec_size, 8)); /* response_length */
2226
2227 brw_pop_insn_state(p);
2228 }
2229
2230
2231 void brw_fb_WRITE(struct brw_codegen *p,
2232 struct brw_reg payload,
2233 struct brw_reg implied_header,
2234 unsigned msg_control,
2235 unsigned binding_table_index,
2236 unsigned msg_length,
2237 unsigned response_length,
2238 bool eot,
2239 bool last_render_target,
2240 bool header_present)
2241 {
2242 const struct gen_device_info *devinfo = p->devinfo;
2243 const unsigned target_cache =
2244 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2245 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2246 brw_inst *insn;
2247 unsigned msg_type;
2248 struct brw_reg dest, src0;
2249
2250 if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
2251 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2252 else
2253 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2254
2255 if (devinfo->gen >= 6) {
2256 insn = next_insn(p, BRW_OPCODE_SENDC);
2257 } else {
2258 insn = next_insn(p, BRW_OPCODE_SEND);
2259 }
2260 brw_inst_set_compression(devinfo, insn, false);
2261
2262 if (devinfo->gen >= 6) {
2263 /* headerless version, just submit color payload */
2264 src0 = payload;
2265
2266 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2267 } else {
2268 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2269 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2270 src0 = implied_header;
2271
2272 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2273 }
2274
2275 brw_set_dest(p, insn, dest);
2276 brw_set_src0(p, insn, src0);
2277 brw_set_dp_write_message(p,
2278 insn,
2279 binding_table_index,
2280 msg_control,
2281 msg_type,
2282 target_cache,
2283 msg_length,
2284 header_present,
2285 last_render_target,
2286 response_length,
2287 eot,
2288 0 /* send_commit_msg */);
2289 }
2290
2291 brw_inst *
2292 gen9_fb_READ(struct brw_codegen *p,
2293 struct brw_reg dst,
2294 struct brw_reg payload,
2295 unsigned binding_table_index,
2296 unsigned msg_length,
2297 unsigned response_length,
2298 bool per_sample)
2299 {
2300 const struct gen_device_info *devinfo = p->devinfo;
2301 assert(devinfo->gen >= 9);
2302 const unsigned msg_subtype =
2303 brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16 ? 0 : 1;
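   /* Message control assembled below, as encoded here: bit 5 requests
    * per-sample reads and the subtype selects SIMD16 (0) or SIMD8 (1).
    */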
2304 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2305
2306 brw_set_dest(p, insn, dst);
2307 brw_set_src0(p, insn, payload);
2308 brw_set_dp_read_message(p, insn, binding_table_index,
2309 per_sample << 5 | msg_subtype,
2310 GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2311 GEN6_SFID_DATAPORT_RENDER_CACHE,
2312 msg_length, true /* header_present */,
2313 response_length);
2314 brw_inst_set_rt_slot_group(devinfo, insn,
2315 brw_inst_qtr_control(devinfo, p->current) / 2);
2316
2317 return insn;
2318 }
2319
2320 /**
2321 * Texture sample instruction.
2322 * Note: the msg_type plus msg_length values determine exactly what kind
2323 * of sampling operation is performed. See volume 4, page 161 of docs.
2324 */
2325 void brw_SAMPLE(struct brw_codegen *p,
2326 struct brw_reg dest,
2327 unsigned msg_reg_nr,
2328 struct brw_reg src0,
2329 unsigned binding_table_index,
2330 unsigned sampler,
2331 unsigned msg_type,
2332 unsigned response_length,
2333 unsigned msg_length,
2334 unsigned header_present,
2335 unsigned simd_mode,
2336 unsigned return_format)
2337 {
2338 const struct gen_device_info *devinfo = p->devinfo;
2339 brw_inst *insn;
2340
2341 if (msg_reg_nr != -1)
2342 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2343
2344 insn = next_insn(p, BRW_OPCODE_SEND);
2345 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2346
2347 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2348 *
2349 * "Instruction compression is not allowed for this instruction (that
2350 * is, send). The hardware behavior is undefined if this instruction is
2351 * set as compressed. However, compress control can be set to "SecHalf"
2352 * to affect the EMask generation."
2353 *
2354 * No similar wording is found in later PRMs, but there are examples
2355 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2356 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2357 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2358 */
2359 brw_inst_set_compression(devinfo, insn, false);
2360
2361 if (devinfo->gen < 6)
2362 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2363
2364 brw_set_dest(p, insn, dest);
2365 brw_set_src0(p, insn, src0);
2366 brw_set_sampler_message(p, insn,
2367 binding_table_index,
2368 sampler,
2369 msg_type,
2370 response_length,
2371 msg_length,
2372 header_present,
2373 simd_mode,
2374 return_format);
2375 }
2376
2377 /* Adjust the message header's sampler state pointer to
2378 * select the correct group of 16 samplers.
2379 */
2380 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2381 struct brw_reg header,
2382 struct brw_reg sampler_index)
2383 {
2384 /* The "Sampler Index" field can only store values between 0 and 15.
2385 * However, we can add an offset to the "Sampler State Pointer"
2386 * field, effectively selecting a different set of 16 samplers.
2387 *
2388  * The "Sampler State Pointer" needs to be aligned to a 32-byte
2389  * offset, and each sampler state is only 16 bytes, so we can't rely on
2390  * the pointer offset alone - we have to use both fields.
2391 */
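   /* Worked example (illustrative): sampler 20 is in the second group of 16,
    * so the state pointer is advanced by 16 samplers * 16 bytes == 256, and
    * the "Sampler Index" field then selects 20 % 16 == 4 within that group.
    */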
2392
2393 const struct gen_device_info *devinfo = p->devinfo;
2394
2395 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2396 const int sampler_state_size = 16; /* 16 bytes */
2397 uint32_t sampler = sampler_index.ud;
2398
2399 if (sampler >= 16) {
2400 assert(devinfo->is_haswell || devinfo->gen >= 8);
2401 brw_ADD(p,
2402 get_element_ud(header, 3),
2403 get_element_ud(brw_vec8_grf(0, 0), 3),
2404 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2405 }
2406 } else {
2407 /* Non-const sampler array indexing case */
2408 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2409 return;
2410 }
2411
2412 struct brw_reg temp = get_element_ud(header, 3);
2413
2414 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2415 brw_SHL(p, temp, temp, brw_imm_ud(4));
2416 brw_ADD(p,
2417 get_element_ud(header, 3),
2418 get_element_ud(brw_vec8_grf(0, 0), 3),
2419 temp);
2420 }
2421 }
2422
2423 /* All these variables are pretty confusing - we might be better off
2424 * using bitmasks and macros for this, in the old style. Or perhaps
2425 * just having the caller instantiate the fields in dword3 itself.
2426 */
2427 void brw_urb_WRITE(struct brw_codegen *p,
2428 struct brw_reg dest,
2429 unsigned msg_reg_nr,
2430 struct brw_reg src0,
2431 enum brw_urb_write_flags flags,
2432 unsigned msg_length,
2433 unsigned response_length,
2434 unsigned offset,
2435 unsigned swizzle)
2436 {
2437 const struct gen_device_info *devinfo = p->devinfo;
2438 brw_inst *insn;
2439
2440 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2441
2442 if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2443 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2444 brw_push_insn_state(p);
2445 brw_set_default_access_mode(p, BRW_ALIGN_1);
2446 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2447 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2448 BRW_REGISTER_TYPE_UD),
2449 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2450 brw_imm_ud(0xff00));
2451 brw_pop_insn_state(p);
2452 }
2453
2454 insn = next_insn(p, BRW_OPCODE_SEND);
2455
2456 assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2457
2458 brw_set_dest(p, insn, dest);
2459 brw_set_src0(p, insn, src0);
2460 brw_set_src1(p, insn, brw_imm_d(0));
2461
2462 if (devinfo->gen < 6)
2463 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2464
2465 brw_set_urb_message(p,
2466 insn,
2467 flags,
2468 msg_length,
2469 response_length,
2470 offset,
2471 swizzle);
2472 }
2473
2474 struct brw_inst *
2475 brw_send_indirect_message(struct brw_codegen *p,
2476 unsigned sfid,
2477 struct brw_reg dst,
2478 struct brw_reg payload,
2479 struct brw_reg desc)
2480 {
2481 const struct gen_device_info *devinfo = p->devinfo;
2482 struct brw_inst *send;
2483 int setup;
2484
2485 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2486
2487 assert(desc.type == BRW_REGISTER_TYPE_UD);
2488
2489 /* We hold on to the setup instruction (the SEND in the direct case, the OR
2490 * in the indirect case) by its index in the instruction store. The
2491 * pointer returned by next_insn() may become invalid if emitting the SEND
2492 * in the indirect case reallocs the store.
2493 */
2494
2495 if (desc.file == BRW_IMMEDIATE_VALUE) {
2496 setup = p->nr_insn;
2497 send = next_insn(p, BRW_OPCODE_SEND);
2498 brw_set_src1(p, send, desc);
2499
2500 } else {
2501 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2502
2503 brw_push_insn_state(p);
2504 brw_set_default_access_mode(p, BRW_ALIGN_1);
2505 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2506 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2507
2508 /* Load the indirect descriptor to an address register using OR so the
2509 * caller can specify additional descriptor bits with the usual
2510 * brw_set_*_message() helper functions.
2511 */
2512 setup = p->nr_insn;
2513 brw_OR(p, addr, desc, brw_imm_ud(0));
2514
2515 brw_pop_insn_state(p);
2516
2517 send = next_insn(p, BRW_OPCODE_SEND);
2518 brw_set_src1(p, send, addr);
2519 }
2520
2521 if (dst.width < BRW_EXECUTE_8)
2522 brw_inst_set_exec_size(devinfo, send, dst.width);
2523
2524 brw_set_dest(p, send, dst);
2525 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2526 brw_inst_set_sfid(devinfo, send, sfid);
2527
2528 return &p->store[setup];
2529 }
2530
2531 static struct brw_inst *
2532 brw_send_indirect_surface_message(struct brw_codegen *p,
2533 unsigned sfid,
2534 struct brw_reg dst,
2535 struct brw_reg payload,
2536 struct brw_reg surface,
2537 unsigned message_len,
2538 unsigned response_len,
2539 bool header_present)
2540 {
2541 const struct gen_device_info *devinfo = p->devinfo;
2542 struct brw_inst *insn;
2543
2544 if (surface.file != BRW_IMMEDIATE_VALUE) {
2545 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2546
2547 brw_push_insn_state(p);
2548 brw_set_default_access_mode(p, BRW_ALIGN_1);
2549 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2550 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2551
2552 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2553 * some surface array is accessed out of bounds.
2554 */
2555 insn = brw_AND(p, addr,
2556 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2557 BRW_GET_SWZ(surface.swizzle, 0)),
2558 brw_imm_ud(0xff));
2559
2560 brw_pop_insn_state(p);
2561
2562 surface = addr;
2563 }
2564
2565 insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
2566 brw_inst_set_mlen(devinfo, insn, message_len);
2567 brw_inst_set_rlen(devinfo, insn, response_len);
2568 brw_inst_set_header_present(devinfo, insn, header_present);
2569
2570 return insn;
2571 }
2572
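/* A WHILE "jumps before" a given offset when its backward branch target,
 * while_offset + jip * scale (in bytes), lands at or before start_offset --
 * i.e. the loop encloses the instruction being fixed up.
 */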
2573 static bool
2574 while_jumps_before_offset(const struct gen_device_info *devinfo,
2575 brw_inst *insn, int while_offset, int start_offset)
2576 {
2577 int scale = 16 / brw_jump_scale(devinfo);
2578 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2579 : brw_inst_jip(devinfo, insn);
2580 assert(jip < 0);
2581 return while_offset + jip * scale <= start_offset;
2582 }
2583
2584
2585 static int
2586 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2587 {
2588 int offset;
2589 void *store = p->store;
2590 const struct gen_device_info *devinfo = p->devinfo;
2591
2592 int depth = 0;
2593
2594 for (offset = next_offset(devinfo, store, start_offset);
2595 offset < p->next_insn_offset;
2596 offset = next_offset(devinfo, store, offset)) {
2597 brw_inst *insn = store + offset;
2598
2599 switch (brw_inst_opcode(devinfo, insn)) {
2600 case BRW_OPCODE_IF:
2601 depth++;
2602 break;
2603 case BRW_OPCODE_ENDIF:
2604 if (depth == 0)
2605 return offset;
2606 depth--;
2607 break;
2608 case BRW_OPCODE_WHILE:
2609 /* If the while doesn't jump before our instruction, it's the end
2610 * of a sibling do...while loop. Ignore it.
2611 */
2612 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2613 continue;
2614 /* fallthrough */
2615 case BRW_OPCODE_ELSE:
2616 case BRW_OPCODE_HALT:
2617 if (depth == 0)
2618 return offset;
2619 }
2620 }
2621
2622 return 0;
2623 }
2624
2625 /* There is no DO instruction on gen6 and later, so to find the end of the
2626  * loop we have to see if the WHILE is jumping back before our start
2627  * instruction.
2628 */
2629 static int
2630 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2631 {
2632 const struct gen_device_info *devinfo = p->devinfo;
2633 int offset;
2634 void *store = p->store;
2635
2636 assert(devinfo->gen >= 6);
2637
2638 /* Always start after the instruction (such as a WHILE) we're trying to fix
2639 * up.
2640 */
2641 for (offset = next_offset(devinfo, store, start_offset);
2642 offset < p->next_insn_offset;
2643 offset = next_offset(devinfo, store, offset)) {
2644 brw_inst *insn = store + offset;
2645
2646 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2647 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2648 return offset;
2649 }
2650 }
2651 assert(!"not reached");
2652 return start_offset;
2653 }
2654
2655 /* After program generation, go back and update the UIP and JIP of
2656 * BREAK, CONT, and HALT instructions to their correct locations.
2657 */
2658 void
2659 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2660 {
2661 const struct gen_device_info *devinfo = p->devinfo;
2662 int offset;
2663 int br = brw_jump_scale(devinfo);
2664 int scale = 16 / br;
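   /* Illustrative: gen6-7 have br == 2, so scale == 8 and the jump fields
    * count 8-byte half-instructions; gen8+ have br == 16, so scale == 1 and
    * the jump fields are plain byte offsets.
    */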
2665 void *store = p->store;
2666
2667 if (devinfo->gen < 6)
2668 return;
2669
2670 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2671 brw_inst *insn = store + offset;
2672 assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2673
2674 int block_end_offset = brw_find_next_block_end(p, offset);
2675 switch (brw_inst_opcode(devinfo, insn)) {
2676 case BRW_OPCODE_BREAK:
2677 assert(block_end_offset != 0);
2678 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2679 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2680 brw_inst_set_uip(devinfo, insn,
2681 (brw_find_loop_end(p, offset) - offset +
2682 (devinfo->gen == 6 ? 16 : 0)) / scale);
2683 break;
2684 case BRW_OPCODE_CONTINUE:
2685 assert(block_end_offset != 0);
2686 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2687 brw_inst_set_uip(devinfo, insn,
2688 (brw_find_loop_end(p, offset) - offset) / scale);
2689
2690 assert(brw_inst_uip(devinfo, insn) != 0);
2691 assert(brw_inst_jip(devinfo, insn) != 0);
2692 break;
2693
2694 case BRW_OPCODE_ENDIF: {
2695 int32_t jump = (block_end_offset == 0) ?
2696 1 * br : (block_end_offset - offset) / scale;
2697 if (devinfo->gen >= 7)
2698 brw_inst_set_jip(devinfo, insn, jump);
2699 else
2700 brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2701 break;
2702 }
2703
2704 case BRW_OPCODE_HALT:
2705 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2706 *
2707 * "In case of the halt instruction not inside any conditional
2708 * code block, the value of <JIP> and <UIP> should be the
2709 * same. In case of the halt instruction inside conditional code
2710 * block, the <UIP> should be the end of the program, and the
2711 * <JIP> should be end of the most inner conditional code block."
2712 *
2713 * The uip will have already been set by whoever set up the
2714 * instruction.
2715 */
2716 if (block_end_offset == 0) {
2717 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2718 } else {
2719 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2720 }
2721 assert(brw_inst_uip(devinfo, insn) != 0);
2722 assert(brw_inst_jip(devinfo, insn) != 0);
2723 break;
2724 }
2725 }
2726 }
2727
2728 void brw_ff_sync(struct brw_codegen *p,
2729 struct brw_reg dest,
2730 unsigned msg_reg_nr,
2731 struct brw_reg src0,
2732 bool allocate,
2733 unsigned response_length,
2734 bool eot)
2735 {
2736 const struct gen_device_info *devinfo = p->devinfo;
2737 brw_inst *insn;
2738
2739 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2740
2741 insn = next_insn(p, BRW_OPCODE_SEND);
2742 brw_set_dest(p, insn, dest);
2743 brw_set_src0(p, insn, src0);
2744 brw_set_src1(p, insn, brw_imm_d(0));
2745
2746 if (devinfo->gen < 6)
2747 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2748
2749 brw_set_ff_sync_message(p,
2750 insn,
2751 allocate,
2752 response_length,
2753 eot);
2754 }
2755
2756 /**
2757 * Emit the SEND instruction necessary to generate stream output data on Gen6
2758 * (for transform feedback).
2759 *
2760 * If send_commit_msg is true, this is the last piece of stream output data
2761 * from this thread, so send the data as a committed write. According to the
2762 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2763 *
2764 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2765 * writes are complete by sending the final write as a committed write."
2766 */
2767 void
2768 brw_svb_write(struct brw_codegen *p,
2769 struct brw_reg dest,
2770 unsigned msg_reg_nr,
2771 struct brw_reg src0,
2772 unsigned binding_table_index,
2773 bool send_commit_msg)
2774 {
2775 const struct gen_device_info *devinfo = p->devinfo;
2776 const unsigned target_cache =
2777 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2778 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2779 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2780 brw_inst *insn;
2781
2782 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2783
2784 insn = next_insn(p, BRW_OPCODE_SEND);
2785 brw_set_dest(p, insn, dest);
2786 brw_set_src0(p, insn, src0);
2787 brw_set_src1(p, insn, brw_imm_d(0));
2788 brw_set_dp_write_message(p, insn,
2789 binding_table_index,
2790 0, /* msg_control: ignored */
2791 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2792 target_cache,
2793 1, /* msg_length */
2794 true, /* header_present */
2795 0, /* last_render_target: ignored */
2796 send_commit_msg, /* response_length */
2797 0, /* end_of_thread */
2798 send_commit_msg); /* send_commit_msg */
2799 }
2800
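/* Response payload size in GRFs for a surface message (a summary of the
 * cases below): one register in SIMD4x2, num_channels in SIMD8, and
 * 2 * num_channels in SIMD16.
 */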
2801 static unsigned
2802 brw_surface_payload_size(struct brw_codegen *p,
2803 unsigned num_channels,
2804 bool has_simd4x2,
2805 bool has_simd16)
2806 {
2807 if (has_simd4x2 &&
2808 brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2809 return 1;
2810 else if (has_simd16 &&
2811 brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16)
2812 return 2 * num_channels;
2813 else
2814 return num_channels;
2815 }
2816
2817 static void
2818 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2819 brw_inst *insn,
2820 unsigned atomic_op,
2821 bool response_expected)
2822 {
2823 const struct gen_device_info *devinfo = p->devinfo;
2824 unsigned msg_control =
2825 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2826 (response_expected ? 1 << 5 : 0); /* Return data expected */
2827
2828 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2829 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2830 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2831 msg_control |= 1 << 4; /* SIMD8 mode */
2832
2833 brw_inst_set_dp_msg_type(devinfo, insn,
2834 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2835 } else {
2836 brw_inst_set_dp_msg_type(devinfo, insn,
2837 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2838 }
2839 } else {
2840 brw_inst_set_dp_msg_type(devinfo, insn,
2841 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2842
2843 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2844 msg_control |= 1 << 4; /* SIMD8 mode */
2845 }
2846
2847 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2848 }
2849
2850 void
2851 brw_untyped_atomic(struct brw_codegen *p,
2852 struct brw_reg dst,
2853 struct brw_reg payload,
2854 struct brw_reg surface,
2855 unsigned atomic_op,
2856 unsigned msg_length,
2857 bool response_expected)
2858 {
2859 const struct gen_device_info *devinfo = p->devinfo;
2860 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2861 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2862 GEN7_SFID_DATAPORT_DATA_CACHE);
2863 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
2864 /* Mask out unused components -- This is especially important in Align16
2865 * mode on generations that don't have native support for SIMD4x2 atomics,
2866 * because unused but enabled components will cause the dataport to perform
2867 * additional atomic operations on the addresses that happen to be in the
2868 * uninitialized Y, Z and W coordinates of the payload.
2869 */
2870 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2871 struct brw_inst *insn = brw_send_indirect_surface_message(
2872 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
2873 brw_surface_payload_size(p, response_expected,
2874 devinfo->gen >= 8 || devinfo->is_haswell, true),
2875 align1);
2876
2877 brw_set_dp_untyped_atomic_message(
2878 p, insn, atomic_op, response_expected);
2879 }
2880
2881 static void
2882 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2883 struct brw_inst *insn,
2884 unsigned num_channels)
2885 {
2886 const struct gen_device_info *devinfo = p->devinfo;
2887 /* Set mask of 32-bit channels to drop. */
2888 unsigned msg_control = 0xf & (0xf << num_channels);
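   /* E.g. num_channels == 2 gives 0xf & (0xf << 2) == 0xc: the Z and W bits
    * are set, so those channels are dropped and X/Y survive.
    */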
2889
2890 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2891 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
2892 msg_control |= 1 << 4; /* SIMD16 mode */
2893 else
2894 msg_control |= 2 << 4; /* SIMD8 mode */
2895 }
2896
2897 brw_inst_set_dp_msg_type(devinfo, insn,
2898 (devinfo->gen >= 8 || devinfo->is_haswell ?
2899 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2900 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2901 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2902 }
2903
2904 void
2905 brw_untyped_surface_read(struct brw_codegen *p,
2906 struct brw_reg dst,
2907 struct brw_reg payload,
2908 struct brw_reg surface,
2909 unsigned msg_length,
2910 unsigned num_channels)
2911 {
2912 const struct gen_device_info *devinfo = p->devinfo;
2913 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2914 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2915 GEN7_SFID_DATAPORT_DATA_CACHE);
2916 struct brw_inst *insn = brw_send_indirect_surface_message(
2917 p, sfid, dst, payload, surface, msg_length,
2918 brw_surface_payload_size(p, num_channels, true, true),
2919 false);
2920
2921 brw_set_dp_untyped_surface_read_message(
2922 p, insn, num_channels);
2923 }
2924
2925 static void
2926 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
2927 struct brw_inst *insn,
2928 unsigned num_channels)
2929 {
2930 const struct gen_device_info *devinfo = p->devinfo;
2931 /* Set mask of 32-bit channels to drop. */
2932 unsigned msg_control = 0xf & (0xf << num_channels);
2933
2934 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2935 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
2936 msg_control |= 1 << 4; /* SIMD16 mode */
2937 else
2938 msg_control |= 2 << 4; /* SIMD8 mode */
2939 } else {
2940 if (devinfo->gen >= 8 || devinfo->is_haswell)
2941 msg_control |= 0 << 4; /* SIMD4x2 mode */
2942 else
2943 msg_control |= 2 << 4; /* SIMD8 mode */
2944 }
2945
2946 brw_inst_set_dp_msg_type(devinfo, insn,
2947 devinfo->gen >= 8 || devinfo->is_haswell ?
2948 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
2949 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
2950 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2951 }
2952
2953 void
2954 brw_untyped_surface_write(struct brw_codegen *p,
2955 struct brw_reg payload,
2956 struct brw_reg surface,
2957 unsigned msg_length,
2958 unsigned num_channels)
2959 {
2960 const struct gen_device_info *devinfo = p->devinfo;
2961 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2962 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2963 GEN7_SFID_DATAPORT_DATA_CACHE);
2964 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
2965 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
2966 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
2967 WRITEMASK_X : WRITEMASK_XYZW;
2968 struct brw_inst *insn = brw_send_indirect_surface_message(
2969 p, sfid, brw_writemask(brw_null_reg(), mask),
2970 payload, surface, msg_length, 0, align1);
2971
2972 brw_set_dp_untyped_surface_write_message(
2973 p, insn, num_channels);
2974 }
2975
2976 static void
2977 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
2978 struct brw_inst *insn,
2979 unsigned atomic_op,
2980 bool response_expected)
2981 {
2982 const struct gen_device_info *devinfo = p->devinfo;
2983 unsigned msg_control =
2984 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2985 (response_expected ? 1 << 5 : 0); /* Return data expected */
2986
2987 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2988 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2989 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
2990 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
2991
2992 brw_inst_set_dp_msg_type(devinfo, insn,
2993 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
2994 } else {
2995 brw_inst_set_dp_msg_type(devinfo, insn,
2996 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
2997 }
2998
2999 } else {
3000 brw_inst_set_dp_msg_type(devinfo, insn,
3001 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3002
3003 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3004 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3005 }
3006
3007 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3008 }
3009
3010 void
3011 brw_typed_atomic(struct brw_codegen *p,
3012 struct brw_reg dst,
3013 struct brw_reg payload,
3014 struct brw_reg surface,
3015 unsigned atomic_op,
3016 unsigned msg_length,
3017 bool response_expected) {
3018 const struct gen_device_info *devinfo = p->devinfo;
3019 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3020 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3021 GEN6_SFID_DATAPORT_RENDER_CACHE);
3022 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3023 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3024 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3025 struct brw_inst *insn = brw_send_indirect_surface_message(
3026 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3027 brw_surface_payload_size(p, response_expected,
3028 devinfo->gen >= 8 || devinfo->is_haswell, false),
3029 true);
3030
3031 brw_set_dp_typed_atomic_message(
3032 p, insn, atomic_op, response_expected);
3033 }
3034
3035 static void
3036 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3037 struct brw_inst *insn,
3038 unsigned num_channels)
3039 {
3040 const struct gen_device_info *devinfo = p->devinfo;
3041 /* Set mask of unused channels. */
3042 unsigned msg_control = 0xf & (0xf << num_channels);
3043
3044 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3045 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3046 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3047 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3048 else
3049 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3050 }
3051
3052 brw_inst_set_dp_msg_type(devinfo, insn,
3053 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3054 } else {
3055 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3056 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3057 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3058 }
3059
3060 brw_inst_set_dp_msg_type(devinfo, insn,
3061 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3062 }
3063
3064 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3065 }
3066
3067 void
3068 brw_typed_surface_read(struct brw_codegen *p,
3069 struct brw_reg dst,
3070 struct brw_reg payload,
3071 struct brw_reg surface,
3072 unsigned msg_length,
3073 unsigned num_channels)
3074 {
3075 const struct gen_device_info *devinfo = p->devinfo;
3076 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3077 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3078 GEN6_SFID_DATAPORT_RENDER_CACHE);
3079 struct brw_inst *insn = brw_send_indirect_surface_message(
3080 p, sfid, dst, payload, surface, msg_length,
3081 brw_surface_payload_size(p, num_channels,
3082 devinfo->gen >= 8 || devinfo->is_haswell, false),
3083 true);
3084
3085 brw_set_dp_typed_surface_read_message(
3086 p, insn, num_channels);
3087 }
3088
3089 static void
3090 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3091 struct brw_inst *insn,
3092 unsigned num_channels)
3093 {
3094 const struct gen_device_info *devinfo = p->devinfo;
3095 /* Set mask of unused channels. */
3096 unsigned msg_control = 0xf & (0xf << num_channels);
3097
3098 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3099 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3100 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3101 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3102 else
3103 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3104 }
3105
3106 brw_inst_set_dp_msg_type(devinfo, insn,
3107 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3108
3109 } else {
3110 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3111 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3112 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3113 }
3114
3115 brw_inst_set_dp_msg_type(devinfo, insn,
3116 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3117 }
3118
3119 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3120 }
3121
3122 void
3123 brw_typed_surface_write(struct brw_codegen *p,
3124 struct brw_reg payload,
3125 struct brw_reg surface,
3126 unsigned msg_length,
3127 unsigned num_channels)
3128 {
3129 const struct gen_device_info *devinfo = p->devinfo;
3130 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3131 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3132 GEN6_SFID_DATAPORT_RENDER_CACHE);
3133 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3134 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3135 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3136 WRITEMASK_X : WRITEMASK_XYZW);
3137 struct brw_inst *insn = brw_send_indirect_surface_message(
3138 p, sfid, brw_writemask(brw_null_reg(), mask),
3139 payload, surface, msg_length, 0, true);
3140
3141 brw_set_dp_typed_surface_write_message(
3142 p, insn, num_channels);
3143 }
3144
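/* Assemble a MEMORY_FENCE message descriptor: one register of payload, an
 * optional commit response, and a fence type selected by the target SFID
 * (a summary of the cases below).
 */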
3145 static void
3146 brw_set_memory_fence_message(struct brw_codegen *p,
3147 struct brw_inst *insn,
3148 enum brw_message_target sfid,
3149 bool commit_enable)
3150 {
3151 const struct gen_device_info *devinfo = p->devinfo;
3152
3153 brw_set_message_descriptor(p, insn, sfid,
3154 1 /* message length */,
3155 (commit_enable ? 1 : 0) /* response length */,
3156 true /* header present */,
3157 false);
3158
3159 switch (sfid) {
3160 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3161 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3162 break;
3163 case GEN7_SFID_DATAPORT_DATA_CACHE:
3164 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3165 break;
3166 default:
3167 unreachable("Not reached");
3168 }
3169
3170 if (commit_enable)
3171 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3172 }
3173
3174 void
3175 brw_memory_fence(struct brw_codegen *p,
3176 struct brw_reg dst)
3177 {
3178 const struct gen_device_info *devinfo = p->devinfo;
3179 const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
3180 struct brw_inst *insn;
3181
3182 brw_push_insn_state(p);
3183 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3184 brw_set_default_exec_size(p, BRW_EXECUTE_1);
3185 dst = vec1(dst);
3186
3187    /* Set dst as the destination for dependency tracking; the MEMORY_FENCE
3188     * message doesn't write anything back.
3189 */
3190 insn = next_insn(p, BRW_OPCODE_SEND);
3191 dst = retype(dst, BRW_REGISTER_TYPE_UW);
3192 brw_set_dest(p, insn, dst);
3193 brw_set_src0(p, insn, dst);
3194 brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
3195 commit_enable);
3196
3197 if (devinfo->gen == 7 && !devinfo->is_haswell) {
3198 /* IVB does typed surface access through the render cache, so we need to
3199 * flush it too. Use a different register so both flushes can be
3200 * pipelined by the hardware.
3201 */
3202 insn = next_insn(p, BRW_OPCODE_SEND);
3203 brw_set_dest(p, insn, offset(dst, 1));
3204 brw_set_src0(p, insn, offset(dst, 1));
3205 brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
3206 commit_enable);
3207
3208 /* Now write the response of the second message into the response of the
3209 * first to trigger a pipeline stall -- This way future render and data
3210 * cache messages will be properly ordered with respect to past data and
3211 * render cache messages.
3212 */
3213 brw_MOV(p, dst, offset(dst, 1));
3214 }
3215
3216 brw_pop_insn_state(p);
3217 }
3218
3219 void
3220 brw_pixel_interpolator_query(struct brw_codegen *p,
3221 struct brw_reg dest,
3222 struct brw_reg mrf,
3223 bool noperspective,
3224 unsigned mode,
3225 struct brw_reg data,
3226 unsigned msg_length,
3227 unsigned response_length)
3228 {
3229 const struct gen_device_info *devinfo = p->devinfo;
3230 struct brw_inst *insn;
3231 const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);
3232
3233 /* brw_send_indirect_message will automatically use a direct send message
3234 * if data is actually immediate.
3235 */
3236 insn = brw_send_indirect_message(p,
3237 GEN7_SFID_PIXEL_INTERPOLATOR,
3238 dest,
3239 mrf,
3240 vec1(data));
3241 brw_inst_set_mlen(devinfo, insn, msg_length);
3242 brw_inst_set_rlen(devinfo, insn, response_length);
3243
3244 brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
3245 brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
3246 brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
3247 brw_inst_set_pi_message_type(devinfo, insn, mode);
3248 }
3249
3250 void
3251 brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
3252 struct brw_reg mask)
3253 {
3254 const struct gen_device_info *devinfo = p->devinfo;
3255 const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
3256 const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
3257 brw_inst *inst;
3258
3259 assert(devinfo->gen >= 7);
3260 assert(mask.type == BRW_REGISTER_TYPE_UD);
3261
3262 brw_push_insn_state(p);
3263
3264 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3265 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3266
3267 if (devinfo->gen >= 8) {
3268 /* Getting the first active channel index is easy on Gen8: Just find
3269 * the first bit set in the execution mask. The register exists on
3270 * HSW already but it reads back as all ones when the current
3271 * instruction has execution masking disabled, so it's kind of
3272 * useless.
3273 */
3274 struct brw_reg exec_mask =
3275 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
3276
3277 if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
3278 /* Unfortunately, ce0 does not take into account the thread
3279 * dispatch mask, which may be a problem in cases where it's not
3280 * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3281 * some n). Combine ce0 with the given dispatch (or vector) mask
3282 * to mask off those channels which were never dispatched by the
3283 * hardware.
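             *
             * For instance (illustrative), a dispatch mask of 0x55 is not of
             * the form 2^n - 1; ANDing it in clears the odd channels that
             * ce0 would otherwise report as live.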
3284 */
3285 brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
3286 brw_AND(p, vec1(dst), exec_mask, vec1(dst));
3287 exec_mask = vec1(dst);
3288 }
3289
3290 /* Quarter control has the effect of magically shifting the value of
3291 * ce0 so you'll get the first active channel relative to the
3292        * specified quarter control as the result.
3293 */
3294 inst = brw_FBL(p, vec1(dst), exec_mask);
3295 } else {
3296 const struct brw_reg flag = brw_flag_reg(1, 0);
3297
3298 brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
3299
3300 /* Run enough instructions returning zero with execution masking and
3301 * a conditional modifier enabled in order to get the full execution
3302        * mask in f1.0.  We could use a single 32-wide move here if it
3303        * weren't for the hardware bug that causes channel enables to
3304 * be applied incorrectly to the second half of 32-wide instructions
3305 * on Gen7.
3306 */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_flag_reg_nr(devinfo, inst, 1);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         }

         /* Find the first bit set in the exec_size-wide portion of the
          * flag register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.  Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}

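/* Illustrative usage (a sketch, not from this file): getting the index of
 * the first live channel into a scalar destination.  The destination
 * register is hypothetical; real callers pass the dispatch (or vector)
 * mask from the thread payload, or an all-ones immediate when the mask is
 * known to be tightly packed:
 *
 *    brw_find_live_channel(p,
 *                          retype(brw_vec1_grf(10, 0), BRW_REGISTER_TYPE_UD),
 *                          brw_imm_ud(0xffffffff));
 */
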
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial: the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job,
       * but asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         const unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));
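         /* The shift above works because the hstride field is encoded
          * logarithmically (a horizontal stride of 2^(hstride - 1)
          * components), so the byte distance between consecutive components
          * is type_sz(src.type) << (src.hstride - 1); shifting idx by
          * log2(type_sz) + hstride - 1 therefore yields the byte offset of
          * component idx within the region.
          */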

         /* We can only address up to limit bytes using the indirect
          * addressing immediate; account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit)
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         brw_MOV(p, dst,
                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
                        src.type));
      } else {
         /* In SIMD4x2 mode the index can be either zero or one; replicate
          * it to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}

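/* Illustrative usage (a sketch, not from this file): broadcasting the
 * dynamically indexed channel of a SIMD8 float vector in g4 to a scalar
 * destination.  All register numbers are hypothetical:
 *
 *    brw_broadcast(p,
 *                  retype(brw_vec1_grf(10, 0), BRW_REGISTER_TYPE_F),
 *                  retype(brw_vec8_grf(4, 0), BRW_REGISTER_TYPE_F),
 *                  retype(brw_vec1_grf(9, 0), BRW_REGISTER_TYPE_UD));
 */
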
/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword
 * for all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we only want to
 * write one u32.  So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(p->devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the
    * given offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_src1(p, send, brw_imm_ud(0));
   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);

   brw_pop_insn_state(p);
}

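/* Illustrative usage (a sketch, not from this file): emitting the atomic
 * add for a shader_time record.  The payload register and binding table
 * index are hypothetical; real callers build the two-register payload
 * (offset and value) beforehand and pass the actual surface index of the
 * shader_time buffer:
 *
 *    brw_shader_time_add(p, brw_vec8_grf(100, 0), 0);
 */
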
/**
 * Emit the SEND message for a barrier.
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());

   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
                              1 /* msg_length */,
                              0 /* response_length */,
                              false /* header_present */,
                              false /* end_of_thread */);

   brw_inst_set_gateway_notify(devinfo, inst, 1);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}

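/* Illustrative pairing (a sketch, not from this file): generators typically
 * follow the barrier message with a WAIT on the notification register, so
 * the thread sleeps until every thread in the group has reached the
 * barrier.  The payload register holding the barrier ID is hypothetical:
 *
 *    brw_barrier(p, brw_message_reg(1));
 *    brw_WAIT(p);
 */
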
/**
 * Emit the wait instruction for a barrier.
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);