/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */


#include "brw_eu_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
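
/* A minimal usage sketch (hypothetical register numbers): for a gen6+ SEND
 * whose payload currently lives in a GRF, e.g. g64,
 *
 *    struct brw_reg payload = brw_vec8_grf(64, 0);
 *    gen6_resolve_implied_move(p, &payload, 3);
 *
 * emits a MOV of g64 into m3 and rewrites 'payload' to refer to m3, so the
 * SEND that follows reads its message contents from the MRF as required.
 */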

static void
gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This is
    *  to enable loading of a new thread into the same slot while the message
    *  with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}
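
/* With the hack above in place, a message register such as m4 is simply
 * remapped to GRF (GEN7_MRF_HACK_START + 4) -- g116, given the hack base of
 * register 112 -- which falls inside the R112-R127 range quoted from the PRM.
 */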

void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
         /* Even though it's ignored in DA16, this still needs to be
          * programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * On platforms that support fp64, we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16.  In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted, and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}

void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      if (reg.type == BRW_REGISTER_TYPE_DF ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
         brw_inst_set_imm_df(devinfo, inst, reg.df);
      else if (reg.type == BRW_REGISTER_TYPE_UQ ||
               reg.type == BRW_REGISTER_TYPE_Q)
         brw_inst_set_imm_uq(devinfo, inst, reg.u64);
      else
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);

      if (type_sz(reg.type) < 8) {
         brw_inst_set_src1_reg_file(devinfo, inst,
                                    BRW_ARCHITECTURE_REGISTER_FILE);
         brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                       brw_inst_src0_reg_hw_type(devinfo, inst));
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
         } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
            /* This is an oddity that arises from using the same region
             * descriptions for align_16 as for align_1:
             */
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    reg.type == BRW_REGISTER_TYPE_DF &&
                    reg.vstride == BRW_VERTICAL_STRIDE_2) {
            /* From SNB PRM:
             *
             * "For Align16 access mode, only encodings of 0000 and 0011
             *  are allowed. Other codes are reserved."
             *
             * Presumably the DevSNB behavior applies to IVB as well.
             */
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else {
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      }
   }
}


void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *     operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* two-argument instructions can only use 32-bit immediates */
      assert(type_sz(reg.type) < 8);
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
            /* This is an oddity that arises from using the same region
             * descriptions for align_16 as for align_1:
             */
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    reg.type == BRW_REGISTER_TYPE_DF &&
                    reg.vstride == BRW_VERTICAL_STRIDE_2) {
            /* From SNB PRM:
             *
             * "For Align16 access mode, only encodings of 0000 and 0011
             *  are allowed. Other codes are reserved."
             *
             * Presumably the DevSNB behavior applies to IVB as well.
             */
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else {
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      }
   }
}

/**
 * Specify the descriptor and extended descriptor immediate for a SEND(C)
 * message instruction.
 */
void
brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
                unsigned desc, unsigned ex_desc)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst_set_src1_file_type(devinfo, inst,
                               BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_D);
   brw_inst_set_send_desc(devinfo, inst, desc);
   if (devinfo->gen >= 9 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))
      brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
}
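
/* A rough sketch of how these helpers fit together when hand-rolling a
 * message (hypothetical values; most callers go through the higher-level
 * brw_set_*_message() helpers below instead):
 *
 *    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
 *    brw_set_dest(p, send, dst);
 *    brw_set_src0(p, send, payload);
 *    brw_set_desc(p, send, brw_message_desc(devinfo, mlen, rlen, true));
 *
 * brw_set_desc() is the convenience wrapper around brw_set_desc_ex() with
 * ex_desc == 0.
 */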

static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, msg_length, response_length, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}


static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}

static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}

void
brw_set_dp_write_message(struct brw_codegen *p,
                         brw_inst *insn,
                         unsigned binding_table_index,
                         unsigned msg_control,
                         unsigned msg_type,
                         unsigned target_cache,
                         unsigned msg_length,
                         bool header_present,
                         unsigned last_render_target,
                         unsigned response_length,
                         unsigned end_of_thread,
                         unsigned send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
                          BRW_SFID_DATAPORT_WRITE);

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, header_present));

   brw_inst_set_sfid(devinfo, insn, sfid);
   brw_inst_set_eot(devinfo, insn, !!end_of_thread);
   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
   brw_inst_set_rt_last(devinfo, insn, last_render_target);
   if (devinfo->gen < 7) {
      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
   }

   if (devinfo->gen >= 11)
      brw_inst_set_null_rt(devinfo, insn, false);
}

void
brw_set_dp_read_message(struct brw_codegen *p,
                        brw_inst *insn,
                        unsigned binding_table_index,
                        unsigned msg_control,
                        unsigned msg_type,
                        unsigned target_cache,
                        unsigned msg_length,
                        bool header_present,
                        unsigned response_length)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
                          BRW_SFID_DATAPORT_READ);

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, header_present));

   const unsigned opcode = brw_inst_opcode(devinfo, insn);
   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC)
      brw_inst_set_sfid(devinfo, insn, sfid);
   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
   if (devinfo->gen < 6)
      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
}

static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
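
/* Worked example of the block-size encoding above: for num_regs == 4, gen7
 * encodes num_regs - 1 == 3, while gen8+ encodes log2(num_regs) == 2.
 */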

static void
brw_inst_set_state(const struct gen_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}

#define next_insn brw_next_insn
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }
   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];

   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(devinfo, insn, opcode);

   /* Apply the default instruction state */
   brw_inst_set_state(devinfo, insn, p->current);

   return insn;
}

static brw_inst *
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static brw_inst *
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
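   /* For example, a byte subnr of 8 maps to 32-bit component 2 (8 / 4). */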
   return reg.subnr / 4;
}

static enum gen10_align1_3src_vertical_stride
to_3src_align1_vstride(enum brw_vertical_stride vstride)
{
   switch (vstride) {
   case BRW_VERTICAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
   case BRW_VERTICAL_STRIDE_2:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
   case BRW_VERTICAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
   case BRW_VERTICAL_STRIDE_8:
   case BRW_VERTICAL_STRIDE_16:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
   default:
      unreachable("invalid vstride");
   }
}


static enum gen10_align1_3src_src_horizontal_stride
to_3src_align1_hstride(enum brw_horizontal_stride hstride)
{
   switch (hstride) {
   case BRW_HORIZONTAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
   case BRW_HORIZONTAL_STRIDE_1:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
   case BRW_HORIZONTAL_STRIDE_2:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
   case BRW_HORIZONTAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
   default:
      unreachable("invalid hstride");
   }
}

static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < 128);
   assert(src0.nr < 128);
   assert(src1.nr < 128);
   assert(src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                           BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                           BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      }
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      brw_inst_set_3src_a1_src0_vstride(devinfo, inst,
                                        to_3src_align1_vstride(src0.vstride));
      brw_inst_set_3src_a1_src1_vstride(devinfo, inst,
                                        to_3src_align1_vstride(src1.vstride));
      /* no vstride on src2 */

      brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src0.hstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));
      brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src2.hstride));

      brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
      if (src0.type == BRW_REGISTER_TYPE_NF) {
         brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      }
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                         src0.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                         src1.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_ACCUMULATOR);
      brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                         src2.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
   } else {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D ||
             dest.type == BRW_REGISTER_TYPE_UD);
      if (devinfo->gen == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->gen >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types.  The MAD and LRP emitters ensure
          * that all four types are float.  The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}

#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0,                     \
                   struct brw_reg src1)                     \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}

#define ALU3(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1,                         \
                   struct brw_reg src2)                         \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                  \
void brw_##OP(struct brw_codegen *p,                               \
              struct brw_reg dest,                                 \
              struct brw_reg src)                                  \
{                                                                  \
   const struct gen_device_info *devinfo = p->devinfo;             \
   brw_inst *rnd, *add;                                            \
   rnd = next_insn(p, BRW_OPCODE_##OP);                            \
   brw_set_dest(p, rnd, dest);                                     \
   brw_set_src0(p, rnd, src);                                      \
                                                                   \
   if (devinfo->gen < 6) {                                         \
      /* turn on round-increments */                               \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));               \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
   }                                                               \
}
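
/* On gen4/5 the ROUND macros therefore expand to roughly the following
 * sequence (illustrative assembly syntax):
 *
 *    rndz.r dst, src            // also sets the round-increment flag
 *    (+f0) add dst, dst, 1.0F   // predicated fix-up of mis-rounded channels
 *
 * while on gen6+ only the rounding instruction itself is emitted.
 */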


ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use a <1,2,0> source region to read
    * each element twice.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell &&
       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      assert(src0.vstride == BRW_VERTICAL_STRIDE_4 &&
             src0.width == BRW_WIDTH_4 &&
             src0.hstride == BRW_HORIZONTAL_STRIDE_1);

      src0.vstride = BRW_VERTICAL_STRIDE_1;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}

brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}

brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}


void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}




/***********************************************************************
 * Comparisons, if/else/endif
 */

brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}
/* The EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

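   /* brw_jump_scale() yields the jump-count units per instruction: gen4
    * counts whole 128-bit instructions, gen5-7 count 64-bit chunks (2 per
    * instruction, to support compaction), and gen8+ counts bytes (16 per
    * instruction).
    */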
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br * (endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}

void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}

void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /* A single next_insn() call may change the base address of the
    * instruction store (p->store), so call it first, before computing any
    * instruction pointers from indices into the store.
    */
1547 if (emit_endif)
1548 insn = next_insn(p, BRW_OPCODE_ENDIF);
1549
1550 /* Pop the IF and (optional) ELSE instructions from the stack */
1551 p->if_depth_in_loop[p->loop_stack_depth]--;
1552 tmp = pop_if_stack(p);
1553 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1554 else_inst = tmp;
1555 tmp = pop_if_stack(p);
1556 }
1557 if_inst = tmp;
1558
1559 if (!emit_endif) {
1560 /* ENDIF is useless; don't bother emitting it. */
1561 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1562 return;
1563 }
1564
1565 if (devinfo->gen < 6) {
1566 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1567 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1568 brw_set_src1(p, insn, brw_imm_d(0x0));
1569 } else if (devinfo->gen == 6) {
1570 brw_set_dest(p, insn, brw_imm_w(0));
1571 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1572 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1573 } else if (devinfo->gen == 7) {
1574 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1575 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1576 brw_set_src1(p, insn, brw_imm_w(0));
1577 } else {
1578 brw_set_src0(p, insn, brw_imm_d(0));
1579 }
1580
1581 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1582 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1583 if (devinfo->gen < 6)
1584 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1585
1586    /* Also pop an item off the stack in the ENDIF instruction: */
1587 if (devinfo->gen < 6) {
1588 brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1589 brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1590 } else if (devinfo->gen == 6) {
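      /* Gen6 jump counts are in 64-bit units, so 2 means "the next
       * instruction"; brw_set_uip_jip() recomputes this after codegen.
       */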
1591 brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1592 } else {
1593 brw_inst_set_jip(devinfo, insn, 2);
1594 }
1595 patch_IF_ELSE(p, if_inst, else_inst, insn);
1596 }
1597
1598 brw_inst *
1599 brw_BREAK(struct brw_codegen *p)
1600 {
1601 const struct gen_device_info *devinfo = p->devinfo;
1602 brw_inst *insn;
1603
1604 insn = next_insn(p, BRW_OPCODE_BREAK);
1605 if (devinfo->gen >= 8) {
1606 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1607 brw_set_src0(p, insn, brw_imm_d(0x0));
1608 } else if (devinfo->gen >= 6) {
1609 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1610 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1611 brw_set_src1(p, insn, brw_imm_d(0x0));
1612 } else {
1613 brw_set_dest(p, insn, brw_ip_reg());
1614 brw_set_src0(p, insn, brw_ip_reg());
1615 brw_set_src1(p, insn, brw_imm_d(0x0));
1616 brw_inst_set_gen4_pop_count(devinfo, insn,
1617 p->if_depth_in_loop[p->loop_stack_depth]);
1618 }
1619 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1620 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1621
1622 return insn;
1623 }
1624
1625 brw_inst *
1626 brw_CONT(struct brw_codegen *p)
1627 {
1628 const struct gen_device_info *devinfo = p->devinfo;
1629 brw_inst *insn;
1630
1631 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1632 brw_set_dest(p, insn, brw_ip_reg());
1633 if (devinfo->gen >= 8) {
1634 brw_set_src0(p, insn, brw_imm_d(0x0));
1635 } else {
1636 brw_set_src0(p, insn, brw_ip_reg());
1637 brw_set_src1(p, insn, brw_imm_d(0x0));
1638 }
1639
1640 if (devinfo->gen < 6) {
1641 brw_inst_set_gen4_pop_count(devinfo, insn,
1642 p->if_depth_in_loop[p->loop_stack_depth]);
1643 }
1644 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1645 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1646 return insn;
1647 }
1648
1649 brw_inst *
1650 gen6_HALT(struct brw_codegen *p)
1651 {
1652 const struct gen_device_info *devinfo = p->devinfo;
1653 brw_inst *insn;
1654
1655 insn = next_insn(p, BRW_OPCODE_HALT);
1656 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1657 if (devinfo->gen >= 8) {
1658 brw_set_src0(p, insn, brw_imm_d(0x0));
1659 } else {
1660 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1661 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1662 }
1663
1664 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1665 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1666 return insn;
1667 }
1668
1669 /* DO/WHILE loop:
1670 *
1671 * The DO/WHILE is just an unterminated loop -- break or continue are
1672  * used for control within the loop.  There are a few ways it can be
1673  * done.
1674 *
1675 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1676 * jip and no DO instruction.
1677 *
1678 * For non-uniform control flow pre-gen6, there's a DO instruction to
1679 * push the mask, and a WHILE to jump back, and BREAK to get out and
1680 * pop the mask.
1681 *
1682 * For gen6, there's no more mask stack, so no need for DO. WHILE
1683 * just points back to the first instruction of the loop.
1684 */
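/* As an illustrative sketch (not code from this file), a caller emits a
 * loop roughly as:
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *    ... emit the loop body ...
 *    brw_BREAK(p);   // early exit, patched to land past the WHILE
 *    ... more body ...
 *    brw_WHILE(p);   // jumps back to the top of the loop
 *
 * Pre-gen6, brw_WHILE() patches the BREAK/CONT jump counts itself; on
 * gen6+ brw_set_uip_jip() fixes them up after program generation.
 */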
1685 brw_inst *
1686 brw_DO(struct brw_codegen *p, unsigned execute_size)
1687 {
1688 const struct gen_device_info *devinfo = p->devinfo;
1689
1690 if (devinfo->gen >= 6 || p->single_program_flow) {
1691 push_loop_stack(p, &p->store[p->nr_insn]);
1692 return &p->store[p->nr_insn];
1693 } else {
1694 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1695
1696 push_loop_stack(p, insn);
1697
1698 /* Override the defaults for this instruction:
1699 */
1700 brw_set_dest(p, insn, brw_null_reg());
1701 brw_set_src0(p, insn, brw_null_reg());
1702 brw_set_src1(p, insn, brw_null_reg());
1703
1704 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1705 brw_inst_set_exec_size(devinfo, insn, execute_size);
1706 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1707
1708 return insn;
1709 }
1710 }
1711
1712 /**
1713 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1714 * instruction here.
1715 *
1716 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1717 * nesting, since it can always just point to the end of the block/current loop.
1718 */
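/* For example, a BREAK sitting two instructions before the WHILE gets a
 * jump count of br * 3 (one slot past the WHILE), while a CONTINUE in the
 * same spot gets br * 2 and lands on the WHILE itself.
 */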
1719 static void
1720 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1721 {
1722 const struct gen_device_info *devinfo = p->devinfo;
1723 brw_inst *do_inst = get_inner_do_insn(p);
1724 brw_inst *inst;
1725 unsigned br = brw_jump_scale(devinfo);
1726
1727 assert(devinfo->gen < 6);
1728
1729 for (inst = while_inst - 1; inst != do_inst; inst--) {
1730 /* If the jump count is != 0, that means that this instruction has already
1731 * been patched because it's part of a loop inside of the one we're
1732 * patching.
1733 */
1734 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1735 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1736 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1737 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1738 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1739 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1740 }
1741 }
1742 }
1743
1744 brw_inst *
1745 brw_WHILE(struct brw_codegen *p)
1746 {
1747 const struct gen_device_info *devinfo = p->devinfo;
1748 brw_inst *insn, *do_insn;
1749 unsigned br = brw_jump_scale(devinfo);
1750
1751 if (devinfo->gen >= 6) {
1752 insn = next_insn(p, BRW_OPCODE_WHILE);
1753 do_insn = get_inner_do_insn(p);
1754
1755 if (devinfo->gen >= 8) {
1756 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1757 brw_set_src0(p, insn, brw_imm_d(0));
1758 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1759 } else if (devinfo->gen == 7) {
1760 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1761 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1762 brw_set_src1(p, insn, brw_imm_w(0));
1763 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1764 } else {
1765 brw_set_dest(p, insn, brw_imm_w(0));
1766 brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1767 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1768 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1769 }
1770
1771 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1772
1773 } else {
1774 if (p->single_program_flow) {
1775 insn = next_insn(p, BRW_OPCODE_ADD);
1776 do_insn = get_inner_do_insn(p);
1777
1778 brw_set_dest(p, insn, brw_ip_reg());
1779 brw_set_src0(p, insn, brw_ip_reg());
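         /* The IP register holds a byte address, and each full-size
          * (uncompacted) instruction is 16 bytes, hence the scaling.
          */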
1780 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1781 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1782 } else {
1783 insn = next_insn(p, BRW_OPCODE_WHILE);
1784 do_insn = get_inner_do_insn(p);
1785
1786 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1787
1788 brw_set_dest(p, insn, brw_ip_reg());
1789 brw_set_src0(p, insn, brw_ip_reg());
1790 brw_set_src1(p, insn, brw_imm_d(0));
1791
1792 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1793 brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1794 brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1795
1796 brw_patch_break_cont(p, insn);
1797 }
1798 }
1799 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1800
1801 p->loop_stack_depth--;
1802
1803 return insn;
1804 }
1805
1806 /* FORWARD JUMPS:
1807 */
1808 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1809 {
1810 const struct gen_device_info *devinfo = p->devinfo;
1811 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1812 unsigned jmpi = 1;
1813
1814 if (devinfo->gen >= 5)
1815 jmpi = 2;
1816
1817 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1818 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1819
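   /* e.g. with the JMPI at index 10 and p->nr_insn == 15, the count below
    * is jmpi * 4, skipping the four instructions emitted since the JMPI.
    */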
1820 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1821 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1822 }
1823
1824 /* To integrate with the above, it makes sense that the comparison
1825 * instruction should populate the flag register. It might be simpler
1826 * just to use the flag reg for most WM tasks?
1827 */
1828 void brw_CMP(struct brw_codegen *p,
1829 struct brw_reg dest,
1830 unsigned conditional,
1831 struct brw_reg src0,
1832 struct brw_reg src1)
1833 {
1834 const struct gen_device_info *devinfo = p->devinfo;
1835 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1836
1837 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1838 brw_set_dest(p, insn, dest);
1839 brw_set_src0(p, insn, src0);
1840 brw_set_src1(p, insn, src1);
1841
1842 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1843 * page says:
1844 * "Any CMP instruction with a null destination must use a {switch}."
1845 *
1846 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1847 * mentioned on their work-arounds pages.
1848 */
1849 if (devinfo->gen == 7) {
1850 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1851 dest.nr == BRW_ARF_NULL) {
1852 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1853 }
1854 }
1855 }
1856
1857 /***********************************************************************
1858 * Helpers for the various SEND message types:
1859 */
1860
1861 /** Extended math function, float[8].
1862 */
1863 void gen4_math(struct brw_codegen *p,
1864 struct brw_reg dest,
1865 unsigned function,
1866 unsigned msg_reg_nr,
1867 struct brw_reg src,
1868 unsigned precision )
1869 {
1870 const struct gen_device_info *devinfo = p->devinfo;
1871 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1872 unsigned data_type;
1873 if (has_scalar_region(src)) {
1874 data_type = BRW_MATH_DATA_SCALAR;
1875 } else {
1876 data_type = BRW_MATH_DATA_VECTOR;
1877 }
1878
1879 assert(devinfo->gen < 6);
1880
1881 /* Example code doesn't set predicate_control for send
1882 * instructions.
1883 */
1884 brw_inst_set_pred_control(devinfo, insn, 0);
1885 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1886
1887 brw_set_dest(p, insn, dest);
1888 brw_set_src0(p, insn, src);
1889 brw_set_math_message(p,
1890 insn,
1891 function,
1892 src.type == BRW_REGISTER_TYPE_D,
1893 precision,
1894 data_type);
1895 }
1896
1897 void gen6_math(struct brw_codegen *p,
1898 struct brw_reg dest,
1899 unsigned function,
1900 struct brw_reg src0,
1901 struct brw_reg src1)
1902 {
1903 const struct gen_device_info *devinfo = p->devinfo;
1904 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1905
1906 assert(devinfo->gen >= 6);
1907
1908 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1909 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1910
1911 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1912 if (devinfo->gen == 6) {
1913 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1914 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1915 }
1916
1917 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1918 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1919 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1920 assert(src0.type != BRW_REGISTER_TYPE_F);
1921 assert(src1.type != BRW_REGISTER_TYPE_F);
1922 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
1923 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
1924 } else {
1925 assert(src0.type == BRW_REGISTER_TYPE_F);
1926 assert(src1.type == BRW_REGISTER_TYPE_F);
1927 }
1928
1929 /* Source modifiers are ignored for extended math instructions on Gen6. */
1930 if (devinfo->gen == 6) {
1931 assert(!src0.negate);
1932 assert(!src0.abs);
1933 assert(!src1.negate);
1934 assert(!src1.abs);
1935 }
1936
1937 brw_inst_set_math_function(devinfo, insn, function);
1938
1939 brw_set_dest(p, insn, dest);
1940 brw_set_src0(p, insn, src0);
1941 brw_set_src1(p, insn, src1);
1942 }
1943
1944 /**
1945 * Return the right surface index to access the thread scratch space using
1946 * stateless dataport messages.
1947 */
1948 unsigned
1949 brw_scratch_surface_idx(const struct brw_codegen *p)
1950 {
1951 /* The scratch space is thread-local so IA coherency is unnecessary. */
1952 if (p->devinfo->gen >= 8)
1953 return GEN8_BTI_STATELESS_NON_COHERENT;
1954 else
1955 return BRW_BTI_STATELESS;
1956 }
1957
1958 /**
1959  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1960 * using a constant offset per channel.
1961 *
1962 * The offset must be aligned to oword size (16 bytes). Used for
1963 * register spilling.
1964 */
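/* A usage sketch with illustrative values (not code from this file):
 * spilling two GRFs to byte offset 64 of the scratch buffer would be
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, 64);
 *
 * where m1 receives the header and the two following message registers
 * hold the data to be written (mlen = 1 + num_regs).
 */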
1965 void brw_oword_block_write_scratch(struct brw_codegen *p,
1966 struct brw_reg mrf,
1967 int num_regs,
1968 unsigned offset)
1969 {
1970 const struct gen_device_info *devinfo = p->devinfo;
1971 const unsigned target_cache =
1972 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
1973 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
1974 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
1975 uint32_t msg_type;
1976
1977 if (devinfo->gen >= 6)
1978 offset /= 16;
1979
1980 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1981
1982 const unsigned mlen = 1 + num_regs;
1983
1984 /* Set up the message header. This is g0, with g0.2 filled with
1985 * the offset. We don't want to leave our offset around in g0 or
1986 * it'll screw up texture samples, so set it up inside the message
1987 * reg.
1988 */
1989 {
1990 brw_push_insn_state(p);
1991 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1992 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1993 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1994
1995 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1996
1997 /* set message header global offset field (reg 0, element 2) */
1998 brw_set_default_exec_size(p, BRW_EXECUTE_1);
1999 brw_MOV(p,
2000 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2001 mrf.nr,
2002 2), BRW_REGISTER_TYPE_UD),
2003 brw_imm_ud(offset));
2004
2005 brw_pop_insn_state(p);
2006 }
2007
2008 {
2009 struct brw_reg dest;
2010 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2011 int send_commit_msg;
2012 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2013 BRW_REGISTER_TYPE_UW);
2014
2015 brw_inst_set_compression(devinfo, insn, false);
2016
2017 if (brw_inst_exec_size(devinfo, insn) >= 16)
2018 src_header = vec16(src_header);
2019
2020 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2021 if (devinfo->gen < 6)
2022 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2023
2024 /* Until gen6, writes followed by reads from the same location
2025 * are not guaranteed to be ordered unless write_commit is set.
2026 * If set, then a no-op write is issued to the destination
2027 * register to set a dependency, and a read from the destination
2028 * can be used to ensure the ordering.
2029 *
2030 * For gen6, only writes between different threads need ordering
2031 * protection. Our use of DP writes is all about register
2032 * spilling within a thread.
2033 */
2034 if (devinfo->gen >= 6) {
2035 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2036 send_commit_msg = 0;
2037 } else {
2038 dest = src_header;
2039 send_commit_msg = 1;
2040 }
2041
2042 brw_set_dest(p, insn, dest);
2043 if (devinfo->gen >= 6) {
2044 brw_set_src0(p, insn, mrf);
2045 } else {
2046 brw_set_src0(p, insn, brw_null_reg());
2047 }
2048
2049 if (devinfo->gen >= 6)
2050 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2051 else
2052 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2053
2054 brw_set_dp_write_message(p,
2055 insn,
2056 brw_scratch_surface_idx(p),
2057 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2058 msg_type,
2059 target_cache,
2060 mlen,
2061 true, /* header_present */
2062 0, /* not a render target */
2063 send_commit_msg, /* response_length */
2064 0, /* eot */
2065 send_commit_msg);
2066 }
2067 }
2068
2069
2070 /**
2071 * Read a block of owords (half a GRF each) from the scratch buffer
2072 * using a constant index per channel.
2073 *
2074 * Offset must be aligned to oword size (16 bytes). Used for register
2075 * spilling.
2076 */
2077 void
2078 brw_oword_block_read_scratch(struct brw_codegen *p,
2079 struct brw_reg dest,
2080 struct brw_reg mrf,
2081 int num_regs,
2082 unsigned offset)
2083 {
2084 const struct gen_device_info *devinfo = p->devinfo;
2085
2086 if (devinfo->gen >= 6)
2087 offset /= 16;
2088
2089 if (p->devinfo->gen >= 7) {
2090       /* On gen7 and above, we no longer have message registers and we can
2091 * send from any register we want. By using the destination register
2092 * for the message, we guarantee that the implied message write won't
2093 * accidentally overwrite anything. This has been a problem because
2094 * the MRF registers and source for the final FB write are both fixed
2095 * and may overlap.
2096 */
2097 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2098 } else {
2099 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2100 }
2101 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2102
2103 const unsigned rlen = num_regs;
2104 const unsigned target_cache =
2105 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2106 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2107 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2108
2109 {
2110 brw_push_insn_state(p);
2111 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2112 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2113 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2114
2115 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2116
2117 /* set message header global offset field (reg 0, element 2) */
2118 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2119 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2120
2121 brw_pop_insn_state(p);
2122 }
2123
2124 {
2125 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2126
2127 assert(brw_inst_pred_control(devinfo, insn) == 0);
2128 brw_inst_set_compression(devinfo, insn, false);
2129
2130 brw_set_dest(p, insn, dest); /* UW? */
2131 if (devinfo->gen >= 6) {
2132 brw_set_src0(p, insn, mrf);
2133 } else {
2134 brw_set_src0(p, insn, brw_null_reg());
2135 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2136 }
2137
2138 brw_set_dp_read_message(p,
2139 insn,
2140 brw_scratch_surface_idx(p),
2141 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2142 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2143 target_cache,
2144 1, /* msg_length */
2145 true, /* header_present */
2146 rlen);
2147 }
2148 }
2149
2150 void
2151 gen7_block_read_scratch(struct brw_codegen *p,
2152 struct brw_reg dest,
2153 int num_regs,
2154 unsigned offset)
2155 {
2156 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2157 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2158
2159 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2160
2161 /* The HW requires that the header is present; this is to get the g0.5
2162 * scratch offset.
2163 */
2164 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2165
2166 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2167 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2168 * is 32 bytes, which happens to be the size of a register.
2169 */
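   /* e.g. a byte offset of 0x1000 divides down to HWord offset 128, well
    * within the 12-bit field checked below.
    */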
2170 offset /= REG_SIZE;
2171 assert(offset < (1 << 12));
2172
2173 gen7_set_dp_scratch_message(p, insn,
2174 false, /* scratch read */
2175 false, /* OWords */
2176 false, /* invalidate after read */
2177 num_regs,
2178 offset,
2179 1, /* mlen: just g0 */
2180 num_regs, /* rlen */
2181 true); /* header present */
2182 }
2183
2184 /**
2185 * Read float[4] vectors from the data port constant cache.
2186 * Location (in buffer) should be a multiple of 16.
2187 * Used for fetching shader constants.
2188 */
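/* An illustrative call (hypothetical values): fetching a block of
 * constants from binding table entry 3 at buffer offset 64 would be
 *
 *    brw_oword_block_read(p, dest, brw_message_reg(1), 64, 3);
 *
 * with the block size following the default execution size.
 */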
2189 void brw_oword_block_read(struct brw_codegen *p,
2190 struct brw_reg dest,
2191 struct brw_reg mrf,
2192 uint32_t offset,
2193 uint32_t bind_table_index)
2194 {
2195 const struct gen_device_info *devinfo = p->devinfo;
2196 const unsigned target_cache =
2197 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2198 BRW_DATAPORT_READ_TARGET_DATA_CACHE);
2199 const unsigned exec_size = 1 << brw_get_default_exec_size(p);
2200
2201    /* On gen6 and newer, offset is in units of owords. */
2202 if (devinfo->gen >= 6)
2203 offset /= 16;
2204
2205 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2206
2207 brw_push_insn_state(p);
2208 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2209 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2210 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2211
2212 brw_push_insn_state(p);
2213 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2214 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2215
2216 /* set message header global offset field (reg 0, element 2) */
2217 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2218 brw_MOV(p,
2219 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2220 mrf.nr,
2221 2), BRW_REGISTER_TYPE_UD),
2222 brw_imm_ud(offset));
2223 brw_pop_insn_state(p);
2224
2225 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2226
2227 /* cast dest to a uword[8] vector */
2228 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2229
2230 brw_set_dest(p, insn, dest);
2231 if (devinfo->gen >= 6) {
2232 brw_set_src0(p, insn, mrf);
2233 } else {
2234 brw_set_src0(p, insn, brw_null_reg());
2235 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2236 }
2237
2238 brw_set_dp_read_message(p, insn, bind_table_index,
2239 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2240 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2241 target_cache,
2242 1, /* msg_length */
2243 true, /* header_present */
2244 DIV_ROUND_UP(exec_size, 8)); /* response_length */
2245
2246 brw_pop_insn_state(p);
2247 }
2248
2249 brw_inst *
2250 brw_fb_WRITE(struct brw_codegen *p,
2251 struct brw_reg payload,
2252 struct brw_reg implied_header,
2253 unsigned msg_control,
2254 unsigned binding_table_index,
2255 unsigned msg_length,
2256 unsigned response_length,
2257 bool eot,
2258 bool last_render_target,
2259 bool header_present)
2260 {
2261 const struct gen_device_info *devinfo = p->devinfo;
2262 const unsigned target_cache =
2263 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2264 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2265 brw_inst *insn;
2266 unsigned msg_type;
2267 struct brw_reg dest, src0;
2268
2269 if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
2270 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2271 else
2272 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2273
2274 if (devinfo->gen >= 6) {
2275 insn = next_insn(p, BRW_OPCODE_SENDC);
2276 } else {
2277 insn = next_insn(p, BRW_OPCODE_SEND);
2278 }
2279 brw_inst_set_compression(devinfo, insn, false);
2280
2281 if (devinfo->gen >= 6) {
2282 /* headerless version, just submit color payload */
2283 src0 = payload;
2284
2285 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2286 } else {
2287 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2288 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2289 src0 = implied_header;
2290
2291 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2292 }
2293
2294 brw_set_dest(p, insn, dest);
2295 brw_set_src0(p, insn, src0);
2296 brw_set_dp_write_message(p,
2297 insn,
2298 binding_table_index,
2299 msg_control,
2300 msg_type,
2301 target_cache,
2302 msg_length,
2303 header_present,
2304 last_render_target,
2305 response_length,
2306 eot,
2307 0 /* send_commit_msg */);
2308
2309 return insn;
2310 }
2311
2312 brw_inst *
2313 gen9_fb_READ(struct brw_codegen *p,
2314 struct brw_reg dst,
2315 struct brw_reg payload,
2316 unsigned binding_table_index,
2317 unsigned msg_length,
2318 unsigned response_length,
2319 bool per_sample)
2320 {
2321 const struct gen_device_info *devinfo = p->devinfo;
2322 assert(devinfo->gen >= 9);
2323 const unsigned msg_subtype =
2324 brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
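   /* i.e. subtype 0 selects SIMD16 and 1 selects SIMD8. */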
2325 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2326
2327 brw_set_dest(p, insn, dst);
2328 brw_set_src0(p, insn, payload);
2329 brw_set_dp_read_message(p, insn, binding_table_index,
2330 per_sample << 5 | msg_subtype,
2331 GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2332 GEN6_SFID_DATAPORT_RENDER_CACHE,
2333 msg_length, true /* header_present */,
2334 response_length);
2335 brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
2336
2337 return insn;
2338 }
2339
2340 /**
2341 * Texture sample instruction.
2342 * Note: the msg_type plus msg_length values determine exactly what kind
2343 * of sampling operation is performed. See volume 4, page 161 of docs.
2344 */
2345 void brw_SAMPLE(struct brw_codegen *p,
2346 struct brw_reg dest,
2347 unsigned msg_reg_nr,
2348 struct brw_reg src0,
2349 unsigned binding_table_index,
2350 unsigned sampler,
2351 unsigned msg_type,
2352 unsigned response_length,
2353 unsigned msg_length,
2354 unsigned header_present,
2355 unsigned simd_mode,
2356 unsigned return_format)
2357 {
2358 const struct gen_device_info *devinfo = p->devinfo;
2359 brw_inst *insn;
2360
2361 if (msg_reg_nr != -1)
2362 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2363
2364 insn = next_insn(p, BRW_OPCODE_SEND);
2365 brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
2366 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2367
2368 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2369 *
2370 * "Instruction compression is not allowed for this instruction (that
2371 * is, send). The hardware behavior is undefined if this instruction is
2372 * set as compressed. However, compress control can be set to "SecHalf"
2373 * to affect the EMask generation."
2374 *
2375 * No similar wording is found in later PRMs, but there are examples
2376 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2377 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2378 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2379 */
2380 brw_inst_set_compression(devinfo, insn, false);
2381
2382 if (devinfo->gen < 6)
2383 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2384
2385 brw_set_dest(p, insn, dest);
2386 brw_set_src0(p, insn, src0);
2387 brw_set_desc(p, insn,
2388 brw_message_desc(devinfo, msg_length, response_length,
2389 header_present) |
2390 brw_sampler_desc(devinfo, binding_table_index, sampler,
2391 msg_type, simd_mode, return_format));
2392 }
2393
2394 /* Adjust the message header's sampler state pointer to
2395 * select the correct group of 16 samplers.
2396 */
2397 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2398 struct brw_reg header,
2399 struct brw_reg sampler_index)
2400 {
2401 /* The "Sampler Index" field can only store values between 0 and 15.
2402 * However, we can add an offset to the "Sampler State Pointer"
2403 * field, effectively selecting a different set of 16 samplers.
2404 *
2405 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2406     * offset, and each sampler state is only 16 bytes, so we can't
2407 * exclusively use the offset - we have to use both.
2408 */
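   /* Worked example: for sampler_index 18, the immediate path adds
    * 16 * (18 / 16) * 16 = 256 bytes to the pointer, and the dynamic path
    * computes (18 & 0xf0) << 4 = 256 as well, selecting the second group
    * of 16 samplers either way.
    */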
2409
2410 const struct gen_device_info *devinfo = p->devinfo;
2411
2412 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2413 const int sampler_state_size = 16; /* 16 bytes */
2414 uint32_t sampler = sampler_index.ud;
2415
2416 if (sampler >= 16) {
2417 assert(devinfo->is_haswell || devinfo->gen >= 8);
2418 brw_ADD(p,
2419 get_element_ud(header, 3),
2420 get_element_ud(brw_vec8_grf(0, 0), 3),
2421 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2422 }
2423 } else {
2424 /* Non-const sampler array indexing case */
2425 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2426 return;
2427 }
2428
2429 struct brw_reg temp = get_element_ud(header, 3);
2430
2431 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2432 brw_SHL(p, temp, temp, brw_imm_ud(4));
2433 brw_ADD(p,
2434 get_element_ud(header, 3),
2435 get_element_ud(brw_vec8_grf(0, 0), 3),
2436 temp);
2437 }
2438 }
2439
2440 /* All these variables are pretty confusing - we might be better off
2441 * using bitmasks and macros for this, in the old style. Or perhaps
2442 * just having the caller instantiate the fields in dword3 itself.
2443 */
2444 void brw_urb_WRITE(struct brw_codegen *p,
2445 struct brw_reg dest,
2446 unsigned msg_reg_nr,
2447 struct brw_reg src0,
2448 enum brw_urb_write_flags flags,
2449 unsigned msg_length,
2450 unsigned response_length,
2451 unsigned offset,
2452 unsigned swizzle)
2453 {
2454 const struct gen_device_info *devinfo = p->devinfo;
2455 brw_inst *insn;
2456
2457 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2458
2459 if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2460 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2461 brw_push_insn_state(p);
2462 brw_set_default_access_mode(p, BRW_ALIGN_1);
2463 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2464 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2465 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2466 BRW_REGISTER_TYPE_UD),
2467 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2468 brw_imm_ud(0xff00));
2469 brw_pop_insn_state(p);
2470 }
2471
2472 insn = next_insn(p, BRW_OPCODE_SEND);
2473
2474 assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2475
2476 brw_set_dest(p, insn, dest);
2477 brw_set_src0(p, insn, src0);
2478 brw_set_src1(p, insn, brw_imm_d(0));
2479
2480 if (devinfo->gen < 6)
2481 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2482
2483 brw_set_urb_message(p,
2484 insn,
2485 flags,
2486 msg_length,
2487 response_length,
2488 offset,
2489 swizzle);
2490 }
2491
2492 struct brw_inst *
2493 brw_send_indirect_message(struct brw_codegen *p,
2494 unsigned sfid,
2495 struct brw_reg dst,
2496 struct brw_reg payload,
2497 struct brw_reg desc,
2498 unsigned desc_imm)
2499 {
2500 const struct gen_device_info *devinfo = p->devinfo;
2501 struct brw_inst *send;
2502 int setup;
2503
2504 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2505
2506 assert(desc.type == BRW_REGISTER_TYPE_UD);
2507
2508 /* We hold on to the setup instruction (the SEND in the direct case, the OR
2509 * in the indirect case) by its index in the instruction store. The
2510 * pointer returned by next_insn() may become invalid if emitting the SEND
2511 * in the indirect case reallocs the store.
2512 */
2513
2514 if (desc.file == BRW_IMMEDIATE_VALUE) {
2515 setup = p->nr_insn;
2516 send = next_insn(p, BRW_OPCODE_SEND);
2517 brw_set_desc(p, send, desc.ud | desc_imm);
2518
2519 } else {
2520 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2521
2522 brw_push_insn_state(p);
2523 brw_set_default_access_mode(p, BRW_ALIGN_1);
2524 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2525 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2526 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2527
2528 /* Load the indirect descriptor to an address register using OR so the
2529 * caller can specify additional descriptor bits with the usual
2530 * brw_set_*_message() helper functions.
2531 */
2532 setup = p->nr_insn;
2533 brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2534
2535 brw_pop_insn_state(p);
2536
2537 send = next_insn(p, BRW_OPCODE_SEND);
2538 brw_set_src1(p, send, addr);
2539 }
2540
2541 if (dst.width < BRW_EXECUTE_8)
2542 brw_inst_set_exec_size(devinfo, send, dst.width);
2543
2544 brw_set_dest(p, send, dst);
2545 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2546 brw_inst_set_sfid(devinfo, send, sfid);
2547
2548 return &p->store[setup];
2549 }
2550
2551 static struct brw_inst *
2552 brw_send_indirect_surface_message(struct brw_codegen *p,
2553 unsigned sfid,
2554 struct brw_reg dst,
2555 struct brw_reg payload,
2556 struct brw_reg surface,
2557 unsigned message_len,
2558 unsigned response_len,
2559 bool header_present)
2560 {
2561 const struct gen_device_info *devinfo = p->devinfo;
2562 struct brw_inst *insn;
2563
2564 if (surface.file != BRW_IMMEDIATE_VALUE) {
2565 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2566
2567 brw_push_insn_state(p);
2568 brw_set_default_access_mode(p, BRW_ALIGN_1);
2569 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2570 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2571 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2572
2573 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2574 * some surface array is accessed out of bounds.
2575 */
2576 insn = brw_AND(p, addr,
2577 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2578 BRW_GET_SWZ(surface.swizzle, 0)),
2579 brw_imm_ud(0xff));
2580
2581 brw_pop_insn_state(p);
2582
2583 surface = addr;
2584 }
2585
2586 insn = brw_send_indirect_message(p, sfid, dst, payload, surface, 0);
2587 brw_inst_set_mlen(devinfo, insn, message_len);
2588 brw_inst_set_rlen(devinfo, insn, response_len);
2589 brw_inst_set_header_present(devinfo, insn, header_present);
2590
2591 return insn;
2592 }
2593
2594 static bool
2595 while_jumps_before_offset(const struct gen_device_info *devinfo,
2596 brw_inst *insn, int while_offset, int start_offset)
2597 {
2598 int scale = 16 / brw_jump_scale(devinfo);
2599 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2600 : brw_inst_jip(devinfo, insn);
2601 assert(jip < 0);
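   /* e.g. on gen7, brw_jump_scale() is 2, so scale is 8 and a jip of -4
    * is a 32-byte (two-instruction) backward jump.
    */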
2602 return while_offset + jip * scale <= start_offset;
2603 }
2604
2605
2606 static int
2607 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2608 {
2609 int offset;
2610 void *store = p->store;
2611 const struct gen_device_info *devinfo = p->devinfo;
2612
2613 int depth = 0;
2614
2615 for (offset = next_offset(devinfo, store, start_offset);
2616 offset < p->next_insn_offset;
2617 offset = next_offset(devinfo, store, offset)) {
2618 brw_inst *insn = store + offset;
2619
2620 switch (brw_inst_opcode(devinfo, insn)) {
2621 case BRW_OPCODE_IF:
2622 depth++;
2623 break;
2624 case BRW_OPCODE_ENDIF:
2625 if (depth == 0)
2626 return offset;
2627 depth--;
2628 break;
2629 case BRW_OPCODE_WHILE:
2630 /* If the while doesn't jump before our instruction, it's the end
2631 * of a sibling do...while loop. Ignore it.
2632 */
2633 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2634 continue;
2635 /* fallthrough */
2636 case BRW_OPCODE_ELSE:
2637 case BRW_OPCODE_HALT:
2638 if (depth == 0)
2639 return offset;
2640 }
2641 }
2642
2643 return 0;
2644 }
2645
2646 /* There is no DO instruction on gen6, so to find the end of the loop
2647 * we have to see if the loop is jumping back before our start
2648 * instruction.
2649 */
2650 static int
2651 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2652 {
2653 const struct gen_device_info *devinfo = p->devinfo;
2654 int offset;
2655 void *store = p->store;
2656
2657 assert(devinfo->gen >= 6);
2658
2659 /* Always start after the instruction (such as a WHILE) we're trying to fix
2660 * up.
2661 */
2662 for (offset = next_offset(devinfo, store, start_offset);
2663 offset < p->next_insn_offset;
2664 offset = next_offset(devinfo, store, offset)) {
2665 brw_inst *insn = store + offset;
2666
2667 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2668 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2669 return offset;
2670 }
2671 }
2672 assert(!"not reached");
2673 return start_offset;
2674 }
2675
2676 /* After program generation, go back and update the UIP and JIP of
2677 * BREAK, CONT, and HALT instructions to their correct locations.
2678 */
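/* Roughly: JIP is the near target, the end of the innermost enclosing
 * block (an ELSE, ENDIF, HALT or WHILE); UIP is the far target, the end
 * of the loop for BREAK/CONT or the end of the program for HALT.
 */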
2679 void
2680 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2681 {
2682 const struct gen_device_info *devinfo = p->devinfo;
2683 int offset;
2684 int br = brw_jump_scale(devinfo);
2685 int scale = 16 / br;
2686 void *store = p->store;
2687
2688 if (devinfo->gen < 6)
2689 return;
2690
2691 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2692 brw_inst *insn = store + offset;
2693 assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2694
2695 int block_end_offset = brw_find_next_block_end(p, offset);
2696 switch (brw_inst_opcode(devinfo, insn)) {
2697 case BRW_OPCODE_BREAK:
2698 assert(block_end_offset != 0);
2699 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2700 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2701 brw_inst_set_uip(devinfo, insn,
2702 (brw_find_loop_end(p, offset) - offset +
2703 (devinfo->gen == 6 ? 16 : 0)) / scale);
2704 break;
2705 case BRW_OPCODE_CONTINUE:
2706 assert(block_end_offset != 0);
2707 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2708 brw_inst_set_uip(devinfo, insn,
2709 (brw_find_loop_end(p, offset) - offset) / scale);
2710
2711 assert(brw_inst_uip(devinfo, insn) != 0);
2712 assert(brw_inst_jip(devinfo, insn) != 0);
2713 break;
2714
2715 case BRW_OPCODE_ENDIF: {
2716 int32_t jump = (block_end_offset == 0) ?
2717 1 * br : (block_end_offset - offset) / scale;
2718 if (devinfo->gen >= 7)
2719 brw_inst_set_jip(devinfo, insn, jump);
2720 else
2721 brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2722 break;
2723 }
2724
2725 case BRW_OPCODE_HALT:
2726 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2727 *
2728 * "In case of the halt instruction not inside any conditional
2729 * code block, the value of <JIP> and <UIP> should be the
2730 * same. In case of the halt instruction inside conditional code
2731 * block, the <UIP> should be the end of the program, and the
2732 * <JIP> should be end of the most inner conditional code block."
2733 *
2734 * The uip will have already been set by whoever set up the
2735 * instruction.
2736 */
2737 if (block_end_offset == 0) {
2738 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2739 } else {
2740 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2741 }
2742 assert(brw_inst_uip(devinfo, insn) != 0);
2743 assert(brw_inst_jip(devinfo, insn) != 0);
2744 break;
2745 }
2746 }
2747 }
2748
2749 void brw_ff_sync(struct brw_codegen *p,
2750 struct brw_reg dest,
2751 unsigned msg_reg_nr,
2752 struct brw_reg src0,
2753 bool allocate,
2754 unsigned response_length,
2755 bool eot)
2756 {
2757 const struct gen_device_info *devinfo = p->devinfo;
2758 brw_inst *insn;
2759
2760 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2761
2762 insn = next_insn(p, BRW_OPCODE_SEND);
2763 brw_set_dest(p, insn, dest);
2764 brw_set_src0(p, insn, src0);
2765 brw_set_src1(p, insn, brw_imm_d(0));
2766
2767 if (devinfo->gen < 6)
2768 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2769
2770 brw_set_ff_sync_message(p,
2771 insn,
2772 allocate,
2773 response_length,
2774 eot);
2775 }
2776
2777 /**
2778 * Emit the SEND instruction necessary to generate stream output data on Gen6
2779 * (for transform feedback).
2780 *
2781 * If send_commit_msg is true, this is the last piece of stream output data
2782 * from this thread, so send the data as a committed write. According to the
2783 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2784 *
2785 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2786 * writes are complete by sending the final write as a committed write."
2787 */
2788 void
2789 brw_svb_write(struct brw_codegen *p,
2790 struct brw_reg dest,
2791 unsigned msg_reg_nr,
2792 struct brw_reg src0,
2793 unsigned binding_table_index,
2794 bool send_commit_msg)
2795 {
2796 const struct gen_device_info *devinfo = p->devinfo;
2797 const unsigned target_cache =
2798 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2799 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2800 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2801 brw_inst *insn;
2802
2803 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2804
2805 insn = next_insn(p, BRW_OPCODE_SEND);
2806 brw_set_dest(p, insn, dest);
2807 brw_set_src0(p, insn, src0);
2808 brw_set_src1(p, insn, brw_imm_d(0));
2809 brw_set_dp_write_message(p, insn,
2810 binding_table_index,
2811 0, /* msg_control: ignored */
2812 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2813 target_cache,
2814 1, /* msg_length */
2815 true, /* header_present */
2816 0, /* last_render_target: ignored */
2817 send_commit_msg, /* response_length */
2818 0, /* end_of_thread */
2819 send_commit_msg); /* send_commit_msg */
2820 }
2821
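/* Response/payload size in registers for a surface message. For example,
 * a four-channel untyped read occupies 2 * 4 = 8 registers in SIMD16 but
 * only a single register in the SIMD4x2 (Align16) case.
 */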
2822 static unsigned
2823 brw_surface_payload_size(struct brw_codegen *p,
2824 unsigned num_channels,
2825 bool has_simd4x2,
2826 bool has_simd16)
2827 {
2828 if (has_simd4x2 && brw_get_default_access_mode(p) == BRW_ALIGN_16)
2829 return 1;
2830 else if (has_simd16 && brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2831 return 2 * num_channels;
2832 else
2833 return num_channels;
2834 }
2835
2836 static void
2837 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2838 brw_inst *insn,
2839 unsigned atomic_op,
2840 bool response_expected)
2841 {
2842 const struct gen_device_info *devinfo = p->devinfo;
2843 unsigned msg_control =
2844 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2845 (response_expected ? 1 << 5 : 0); /* Return data expected */
2846
2847 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2848 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2849 if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
2850 msg_control |= 1 << 4; /* SIMD8 mode */
2851
2852 brw_inst_set_dp_msg_type(devinfo, insn,
2853 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2854 } else {
2855 brw_inst_set_dp_msg_type(devinfo, insn,
2856 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2857 }
2858 } else {
2859 brw_inst_set_dp_msg_type(devinfo, insn,
2860 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2861
2862 if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
2863 msg_control |= 1 << 4; /* SIMD8 mode */
2864 }
2865
2866 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2867 }
2868
2869 void
2870 brw_untyped_atomic(struct brw_codegen *p,
2871 struct brw_reg dst,
2872 struct brw_reg payload,
2873 struct brw_reg surface,
2874 unsigned atomic_op,
2875 unsigned msg_length,
2876 bool response_expected,
2877 bool header_present)
2878 {
2879 const struct gen_device_info *devinfo = p->devinfo;
2880 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2881 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2882 GEN7_SFID_DATAPORT_DATA_CACHE);
2883 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
2884 /* Mask out unused components -- This is especially important in Align16
2885 * mode on generations that don't have native support for SIMD4x2 atomics,
2886 * because unused but enabled components will cause the dataport to perform
2887 * additional atomic operations on the addresses that happen to be in the
2888 * uninitialized Y, Z and W coordinates of the payload.
2889 */
2890 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2891 struct brw_inst *insn = brw_send_indirect_surface_message(
2892 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
2893 brw_surface_payload_size(p, response_expected,
2894 devinfo->gen >= 8 || devinfo->is_haswell, true),
2895 header_present);
2896
2897 brw_set_dp_untyped_atomic_message(
2898 p, insn, atomic_op, response_expected);
2899 }
2900
2901 static void
2902 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2903 struct brw_inst *insn,
2904 unsigned num_channels)
2905 {
2906 const struct gen_device_info *devinfo = p->devinfo;
2907 /* Set mask of 32-bit channels to drop. */
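   /* e.g. num_channels == 2 yields 0xc, keeping X/Y and dropping Z/W. */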
2908 unsigned msg_control = 0xf & (0xf << num_channels);
2909
2910 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2911 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2912 msg_control |= 1 << 4; /* SIMD16 mode */
2913 else
2914 msg_control |= 2 << 4; /* SIMD8 mode */
2915 }
2916
2917 brw_inst_set_dp_msg_type(devinfo, insn,
2918 (devinfo->gen >= 8 || devinfo->is_haswell ?
2919 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2920 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2921 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2922 }
2923
2924 void
2925 brw_untyped_surface_read(struct brw_codegen *p,
2926 struct brw_reg dst,
2927 struct brw_reg payload,
2928 struct brw_reg surface,
2929 unsigned msg_length,
2930 unsigned num_channels)
2931 {
2932 const struct gen_device_info *devinfo = p->devinfo;
2933 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2934 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2935 GEN7_SFID_DATAPORT_DATA_CACHE);
2936 struct brw_inst *insn = brw_send_indirect_surface_message(
2937 p, sfid, dst, payload, surface, msg_length,
2938 brw_surface_payload_size(p, num_channels, true, true),
2939 false);
2940
2941 brw_set_dp_untyped_surface_read_message(
2942 p, insn, num_channels);
2943 }
2944
2945 static void
2946 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
2947 struct brw_inst *insn,
2948 unsigned num_channels)
2949 {
2950 const struct gen_device_info *devinfo = p->devinfo;
2951 /* Set mask of 32-bit channels to drop. */
2952 unsigned msg_control = 0xf & (0xf << num_channels);
2953
2954 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
2955 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
2956 msg_control |= 1 << 4; /* SIMD16 mode */
2957 else
2958 msg_control |= 2 << 4; /* SIMD8 mode */
2959 } else {
2960 if (devinfo->gen >= 8 || devinfo->is_haswell)
2961 msg_control |= 0 << 4; /* SIMD4x2 mode */
2962 else
2963 msg_control |= 2 << 4; /* SIMD8 mode */
2964 }
2965
2966 brw_inst_set_dp_msg_type(devinfo, insn,
2967 devinfo->gen >= 8 || devinfo->is_haswell ?
2968 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
2969 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
2970 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2971 }
2972
2973 void
2974 brw_untyped_surface_write(struct brw_codegen *p,
2975 struct brw_reg payload,
2976 struct brw_reg surface,
2977 unsigned msg_length,
2978 unsigned num_channels,
2979 bool header_present)
2980 {
2981 const struct gen_device_info *devinfo = p->devinfo;
2982 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2983 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2984 GEN7_SFID_DATAPORT_DATA_CACHE);
2985 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
2986 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
2987 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
2988 WRITEMASK_X : WRITEMASK_XYZW;
2989 struct brw_inst *insn = brw_send_indirect_surface_message(
2990 p, sfid, brw_writemask(brw_null_reg(), mask),
2991 payload, surface, msg_length, 0, header_present);
2992
2993 brw_set_dp_untyped_surface_write_message(
2994 p, insn, num_channels);
2995 }
2996
2997 static unsigned
2998 brw_byte_scattered_data_element_from_bit_size(unsigned bit_size)
2999 {
3000 switch (bit_size) {
3001 case 8:
3002 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_BYTE;
3003 case 16:
3004 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_WORD;
3005 case 32:
3006 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_DWORD;
3007 default:
3008 unreachable("Unsupported bit_size for byte scattered messages");
3009 }
3010 }
3011
3012
3013 void
3014 brw_byte_scattered_read(struct brw_codegen *p,
3015 struct brw_reg dst,
3016 struct brw_reg payload,
3017 struct brw_reg surface,
3018 unsigned msg_length,
3019 unsigned bit_size)
3020 {
3021 const struct gen_device_info *devinfo = p->devinfo;
3022 assert(devinfo->gen > 7 || devinfo->is_haswell);
3023 assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
3024 const unsigned sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
3025
3026 struct brw_inst *insn = brw_send_indirect_surface_message(
3027 p, sfid, dst, payload, surface, msg_length,
3028 brw_surface_payload_size(p, 1, true, true),
3029 false);
3030
3031 unsigned msg_control =
3032 brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
3033
3034 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
3035 msg_control |= 1; /* SIMD16 mode */
3036 else
3037 msg_control |= 0; /* SIMD8 mode */
3038
3039 brw_inst_set_dp_msg_type(devinfo, insn,
3040 HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ);
3041 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3042 }
3043
3044 void
3045 brw_byte_scattered_write(struct brw_codegen *p,
3046 struct brw_reg payload,
3047 struct brw_reg surface,
3048 unsigned msg_length,
3049 unsigned bit_size,
3050 bool header_present)
3051 {
3052 const struct gen_device_info *devinfo = p->devinfo;
3053 assert(devinfo->gen > 7 || devinfo->is_haswell);
3054 assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
3055 const unsigned sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
3056
3057 struct brw_inst *insn = brw_send_indirect_surface_message(
3058 p, sfid, brw_writemask(brw_null_reg(), WRITEMASK_XYZW),
3059 payload, surface, msg_length, 0, header_present);
3060
3061 unsigned msg_control =
3062 brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
3063
3064 if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
3065 msg_control |= 1;
3066 else
3067 msg_control |= 0;
3068
3069 brw_inst_set_dp_msg_type(devinfo, insn,
3070 HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE);
3071 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3072 }
3073
3074 static void
3075 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3076 struct brw_inst *insn,
3077 unsigned atomic_op,
3078 bool response_expected)
3079 {
3080 const struct gen_device_info *devinfo = p->devinfo;
3081 unsigned msg_control =
3082 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3083 (response_expected ? 1 << 5 : 0); /* Return data expected */
3084
3085 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3086 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3087 if ((brw_get_default_group(p) / 8) % 2 == 1)
3088 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3089
3090 brw_inst_set_dp_msg_type(devinfo, insn,
3091 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3092 } else {
3093 brw_inst_set_dp_msg_type(devinfo, insn,
3094 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3095 }
3096
3097 } else {
3098 brw_inst_set_dp_msg_type(devinfo, insn,
3099 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3100
3101 if ((brw_get_default_group(p) / 8) % 2 == 1)
3102 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3103 }
3104
3105 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3106 }
3107
3108 void
3109 brw_typed_atomic(struct brw_codegen *p,
3110 struct brw_reg dst,
3111 struct brw_reg payload,
3112 struct brw_reg surface,
3113 unsigned atomic_op,
3114 unsigned msg_length,
3115 bool response_expected,
3116 bool header_present) {
3117 const struct gen_device_info *devinfo = p->devinfo;
3118 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3119 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3120 GEN6_SFID_DATAPORT_RENDER_CACHE);
3121 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3122 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3123 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3124 struct brw_inst *insn = brw_send_indirect_surface_message(
3125 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3126 brw_surface_payload_size(p, response_expected,
3127 devinfo->gen >= 8 || devinfo->is_haswell, false),
3128 header_present);
3129
3130 brw_set_dp_typed_atomic_message(
3131 p, insn, atomic_op, response_expected);
3132 }
3133
3134 static void
3135 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3136 struct brw_inst *insn,
3137 unsigned num_channels)
3138 {
3139 const struct gen_device_info *devinfo = p->devinfo;
3140 /* Set mask of unused channels. */
3141 unsigned msg_control = 0xf & (0xf << num_channels);
3142
3143 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3144 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3145 if ((brw_get_default_group(p) / 8) % 2 == 1)
3146 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3147 else
3148 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3149 }
3150
3151 brw_inst_set_dp_msg_type(devinfo, insn,
3152 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3153 } else {
3154 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3155 if ((brw_get_default_group(p) / 8) % 2 == 1)
3156 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3157 }
3158
3159 brw_inst_set_dp_msg_type(devinfo, insn,
3160 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3161 }
3162
3163 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3164 }
3165
3166 void
3167 brw_typed_surface_read(struct brw_codegen *p,
3168 struct brw_reg dst,
3169 struct brw_reg payload,
3170 struct brw_reg surface,
3171 unsigned msg_length,
3172 unsigned num_channels,
3173 bool header_present)
3174 {
3175 const struct gen_device_info *devinfo = p->devinfo;
3176 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3177 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3178 GEN6_SFID_DATAPORT_RENDER_CACHE);
3179 struct brw_inst *insn = brw_send_indirect_surface_message(
3180 p, sfid, dst, payload, surface, msg_length,
3181 brw_surface_payload_size(p, num_channels,
3182 devinfo->gen >= 8 || devinfo->is_haswell, false),
3183 header_present);
3184
3185 brw_set_dp_typed_surface_read_message(
3186 p, insn, num_channels);
3187 }
3188
3189 static void
3190 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3191 struct brw_inst *insn,
3192 unsigned num_channels)
3193 {
3194 const struct gen_device_info *devinfo = p->devinfo;
3195 /* Set mask of unused channels. */
3196 unsigned msg_control = 0xf & (0xf << num_channels);
3197
3198 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3199 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3200 if ((brw_get_default_group(p) / 8) % 2 == 1)
3201 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3202 else
3203 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3204 }
3205
3206 brw_inst_set_dp_msg_type(devinfo, insn,
3207 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3208
3209 } else {
3210 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3211 if ((brw_get_default_group(p) / 8) % 2 == 1)
3212 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3213 }
3214
3215 brw_inst_set_dp_msg_type(devinfo, insn,
3216 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3217 }
3218
3219 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3220 }
3221
3222 void
3223 brw_typed_surface_write(struct brw_codegen *p,
3224 struct brw_reg payload,
3225 struct brw_reg surface,
3226 unsigned msg_length,
3227 unsigned num_channels,
3228 bool header_present)
3229 {
3230 const struct gen_device_info *devinfo = p->devinfo;
3231 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3232 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3233 GEN6_SFID_DATAPORT_RENDER_CACHE);
3234 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3235 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3236 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3237 WRITEMASK_X : WRITEMASK_XYZW);
3238 struct brw_inst *insn = brw_send_indirect_surface_message(
3239 p, sfid, brw_writemask(brw_null_reg(), mask),
3240 payload, surface, msg_length, 0, header_present);
3241
3242 brw_set_dp_typed_surface_write_message(
3243 p, insn, num_channels);
3244 }
3245
3246 static void
3247 brw_set_memory_fence_message(struct brw_codegen *p,
3248 struct brw_inst *insn,
3249 enum brw_message_target sfid,
3250 bool commit_enable)
3251 {
3252 const struct gen_device_info *devinfo = p->devinfo;
3253
3254 brw_set_desc(p, insn, brw_message_desc(
3255 devinfo, 1, (commit_enable ? 1 : 0), true));
3256
3257 brw_inst_set_sfid(devinfo, insn, sfid);
3258
3259 switch (sfid) {
3260 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3261 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3262 break;
3263 case GEN7_SFID_DATAPORT_DATA_CACHE:
3264 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3265 break;
3266 default:
3267 unreachable("Not reached");
3268 }
3269
3270 if (commit_enable)
3271 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3272 }
3273
3274 void
3275 brw_memory_fence(struct brw_codegen *p,
3276 struct brw_reg dst,
3277 enum opcode send_op)
3278 {
3279 const struct gen_device_info *devinfo = p->devinfo;
3280 const bool commit_enable =
3281 devinfo->gen >= 10 || /* HSD ES # 1404612949 */
3282 (devinfo->gen == 7 && !devinfo->is_haswell);
3283 struct brw_inst *insn;
3284
3285 brw_push_insn_state(p);
3286 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3287 brw_set_default_exec_size(p, BRW_EXECUTE_1);
3288 dst = vec1(dst);
3289
3290 /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3291 * message doesn't write anything back.
3292 */
3293 insn = next_insn(p, send_op);
3294 dst = retype(dst, BRW_REGISTER_TYPE_UW);
3295 brw_set_dest(p, insn, dst);
3296 brw_set_src0(p, insn, dst);
3297 brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
3298 commit_enable);
3299
3300 if (devinfo->gen == 7 && !devinfo->is_haswell) {
3301 /* IVB does typed surface access through the render cache, so we need to
3302 * flush it too. Use a different register so both flushes can be
3303 * pipelined by the hardware.
3304 */
3305 insn = next_insn(p, send_op);
3306 brw_set_dest(p, insn, offset(dst, 1));
3307 brw_set_src0(p, insn, offset(dst, 1));
3308 brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
3309 commit_enable);
3310
3311 /* Now write the response of the second message into the response of the
3312        * first to trigger a pipeline stall -- this way future render and data
3313 * cache messages will be properly ordered with respect to past data and
3314 * render cache messages.
3315 */
3316 brw_MOV(p, dst, offset(dst, 1));
3317 }
3318
3319 brw_pop_insn_state(p);
3320 }
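/* For reference, on IVB the function above emits roughly the following
 * sequence (a sketch -- register numbers are illustrative only):
 *
 *    send(1) g1<1>UW g1<0,1,0>UW  data cache fence, commit enable
 *    send(1) g2<1>UW g2<0,1,0>UW  render cache fence, commit enable
 *    mov(1)  g1<1>UW g2<0,1,0>UW  stall until both fences have committed
 */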
3321
3322 void
3323 brw_pixel_interpolator_query(struct brw_codegen *p,
3324 struct brw_reg dest,
3325 struct brw_reg mrf,
3326 bool noperspective,
3327 unsigned mode,
3328 struct brw_reg data,
3329 unsigned msg_length,
3330 unsigned response_length)
3331 {
3332 const struct gen_device_info *devinfo = p->devinfo;
3333 struct brw_inst *insn;
3334 const uint16_t exec_size = brw_get_default_exec_size(p);
3335 const uint16_t qtr_ctrl = brw_get_default_group(p) / 8;
3336
3337 /* brw_send_indirect_message will automatically use a direct send message
3338     * if data is actually an immediate value.
3339 */
3340 insn = brw_send_indirect_message(p,
3341 GEN7_SFID_PIXEL_INTERPOLATOR,
3342 dest,
3343 mrf,
3344 vec1(data), 0);
3345 brw_inst_set_mlen(devinfo, insn, msg_length);
3346 brw_inst_set_rlen(devinfo, insn, response_length);
3347
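/* For example, a SIMD16 query for the upper half of a SIMD32 shader runs
 * with group 16, so qtr_ctrl == 2: the SIMD mode is set to 1 (SIMD16
 * payload) and the slot group to qtr_ctrl / 2 == 1.
 */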
3348 brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
3349 brw_inst_set_pi_slot_group(devinfo, insn, qtr_ctrl / 2);
3350 brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
3351 brw_inst_set_pi_message_type(devinfo, insn, mode);
3352 }
3353
3354 void
3355 brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
3356 struct brw_reg mask)
3357 {
3358 const struct gen_device_info *devinfo = p->devinfo;
3359 const unsigned exec_size = 1 << brw_get_default_exec_size(p);
3360 const unsigned qtr_control = brw_get_default_group(p) / 8;
3361 brw_inst *inst;
3362
3363 assert(devinfo->gen >= 7);
3364 assert(mask.type == BRW_REGISTER_TYPE_UD);
3365
3366 brw_push_insn_state(p);
3367
3368 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3369 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3370
3371 if (devinfo->gen >= 8) {
3372 /* Getting the first active channel index is easy on Gen8: Just find
3373 * the first bit set in the execution mask. The register exists on
3374 * HSW already but it reads back as all ones when the current
3375 * instruction has execution masking disabled, so it's kind of
3376 * useless.
3377 */
3378 struct brw_reg exec_mask =
3379 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
3380
3381 brw_set_default_exec_size(p, BRW_EXECUTE_1);
3382 if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
3383 /* Unfortunately, ce0 does not take into account the thread
3384 * dispatch mask, which may be a problem in cases where it's not
3385 * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3386 * some n). Combine ce0 with the given dispatch (or vector) mask
3387 * to mask off those channels which were never dispatched by the
3388 * hardware.
3389 */
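/* For example, with qtr_control == 1 the mask is shifted right by 8 bits
 * so its second byte lines up with the (already quarter-shifted) contents
 * of ce0 before the two are ANDed together.
 */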
3390 brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
3391 brw_AND(p, vec1(dst), exec_mask, vec1(dst));
3392 exec_mask = vec1(dst);
3393 }
3394
3395 /* Quarter control has the effect of magically shifting the value of
3396 * ce0 so you'll get the first active channel relative to the
3397           * specified quarter control as the result.
3398 */
3399 inst = brw_FBL(p, vec1(dst), exec_mask);
3400 } else {
3401 const struct brw_reg flag = brw_flag_reg(p->current->flag_subreg / 2,
3402 p->current->flag_subreg % 2);
3403
3404 brw_set_default_exec_size(p, BRW_EXECUTE_1);
3405 brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
3406
3407 /* Run enough instructions returning zero with execution masking and
3408 * a conditional modifier enabled in order to get the full execution
3409 * mask in f1.0. We could use a single 32-wide move here if it
3410           * weren't for the hardware bug that causes channel enables to
3411 * be applied incorrectly to the second half of 32-wide instructions
3412 * on Gen7.
3413 */
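/* For example, a (hypothetical) 32-wide shader would emit two 16-wide
 * MOVs here, covering channel groups 0-15 and 16-31.
 */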
3414 const unsigned lower_size = MIN2(16, exec_size);
3415 for (unsigned i = 0; i < exec_size / lower_size; i++) {
3416 inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
3417 brw_imm_uw(0));
3418 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3419 brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
3420 brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
3421 brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
3422 }
3423
3424 /* Find the first bit set in the exec_size-wide portion of the flag
3425 * register that was updated by the last sequence of MOV
3426 * instructions.
3427 */
3428 const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
3429 brw_set_default_exec_size(p, BRW_EXECUTE_1);
3430 brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
3431 }
3432 } else {
3433 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3434
3435 if (devinfo->gen >= 8 &&
3436 mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
3437 /* In SIMD4x2 mode the first active channel index is just the
3438 * negation of the first bit of the mask register. Note that ce0
3439 * doesn't take into account the dispatch mask, so the Gen7 path
3440 * should be used instead unless you have the guarantee that the
3441 * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3442 * for some n).
3443 */
3444 inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
3445 negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
3446 brw_imm_ud(1));
3447
3448 } else {
3449          /* Overwrite the destination first without and then with execution
3450           * masking in order to find out which of the channels is active.
3451 */
3452 brw_push_insn_state(p);
3453 brw_set_default_exec_size(p, BRW_EXECUTE_4);
3454 brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3455 brw_imm_ud(1));
3456
3457 inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3458 brw_imm_ud(0));
3459 brw_pop_insn_state(p);
3460 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3461 }
3462 }
3463
3464 brw_pop_insn_state(p);
3465 }
3466
3467 void
3468 brw_broadcast(struct brw_codegen *p,
3469 struct brw_reg dst,
3470 struct brw_reg src,
3471 struct brw_reg idx)
3472 {
3473 const struct gen_device_info *devinfo = p->devinfo;
3474 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3475 brw_inst *inst;
3476
3477 brw_push_insn_state(p);
3478 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3479 brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);
3480
3481 assert(src.file == BRW_GENERAL_REGISTER_FILE &&
3482 src.address_mode == BRW_ADDRESS_DIRECT);
3483 assert(!src.abs && !src.negate);
3484 assert(src.type == dst.type);
3485
3486 if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
3487 idx.file == BRW_IMMEDIATE_VALUE) {
3488       /* Trivial: the source is already uniform or the index is a constant.
3489 * We will typically not get here if the optimizer is doing its job, but
3490 * asserting would be mean.
3491 */
3492 const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
3493 brw_MOV(p, dst,
3494 (align1 ? stride(suboffset(src, i), 0, 1, 0) :
3495 stride(suboffset(src, 4 * i), 0, 4, 1)));
3496 } else {
3497 /* From the Haswell PRM section "Register Region Restrictions":
3498 *
3499 * "The lower bits of the AddressImmediate must not overflow to
3500 * change the register address. The lower 5 bits of Address
3501 * Immediate when added to lower 5 bits of address register gives
3502 * the sub-register offset. The upper bits of Address Immediate
3503 * when added to upper bits of address register gives the register
3504 * address. Any overflow from sub-register offset is dropped."
3505 *
3506 * Fortunately, for broadcast, we never have a sub-register offset so
3507 * this isn't an issue.
3508 */
3509 assert(src.subnr == 0);
3510
3511 if (align1) {
3512 const struct brw_reg addr =
3513 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
3514 unsigned offset = src.nr * REG_SIZE + src.subnr;
3515 /* Limit in bytes of the signed indirect addressing immediate. */
3516 const unsigned limit = 512;
3517
3518 brw_push_insn_state(p);
3519 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3520 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
3521
3522 /* Take into account the component size and horizontal stride. */
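/* For example, a 4-byte component with hstride == 1 gives a shift of
 * log2(4) + 1 - 1 == 2, i.e. addr = idx * 4 bytes.
 */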
3523 assert(src.vstride == src.hstride + src.width);
3524 brw_SHL(p, addr, vec1(idx),
3525 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
3526 src.hstride - 1));
3527
3528 /* We can only address up to limit bytes using the indirect
3529          * addressing immediate; account for the difference if the source
3530 * register is above this limit.
3531 */
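/* For example, an offset of 600 bytes adds 512 to the address register
 * and leaves offset == 88 for the addressing immediate.
 */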
3532 if (offset >= limit) {
3533 brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
3534 offset = offset % limit;
3535 }
3536
3537 brw_pop_insn_state(p);
3538
3539 /* Use indirect addressing to fetch the specified component. */
3540 if (type_sz(src.type) > 4 &&
3541 (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
3542 /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
3543 *
3544 * "When source or destination datatype is 64b or operation is
3545 * integer DWord multiply, indirect addressing must not be
3546 * used."
3547 *
3548             * To work around this restriction, we do two integer MOVs
3549             * instead of one 64-bit MOV. Because no double value should ever
3550 * cross a register boundary, it's safe to use the immediate
3551 * offset in the indirect here to handle adding 4 bytes to the
3552 * offset and avoid the extra ADD to the register file.
3553 */
3554 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
3555 retype(brw_vec1_indirect(addr.subnr, offset),
3556 BRW_REGISTER_TYPE_D));
3557 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
3558 retype(brw_vec1_indirect(addr.subnr, offset + 4),
3559 BRW_REGISTER_TYPE_D));
3560 } else {
3561 brw_MOV(p, dst,
3562 retype(brw_vec1_indirect(addr.subnr, offset), src.type));
3563 }
3564 } else {
3565          /* In SIMD4x2 mode the index can only be zero or one; replicate it
3566           * to all bits of a flag register,
3567 */
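/* For example, idx == 1 makes the NZ comparison below set every flag bit,
 * so the predicated SEL picks suboffset(src, 4) (the second vec4), while
 * idx == 0 leaves the flag clear and the SEL falls through to src.
 */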
3568 inst = brw_MOV(p,
3569 brw_null_reg(),
3570 stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
3571 brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
3572 brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
3573 brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3574
3575 /* and use predicated SEL to pick the right channel. */
3576 inst = brw_SEL(p, dst,
3577 stride(suboffset(src, 4), 4, 4, 1),
3578 stride(src, 4, 4, 1));
3579 brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
3580 brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3581 }
3582 }
3583
3584 brw_pop_insn_state(p);
3585 }
3586
3587 /**
3588 * This instruction is generated as a single-channel align1 instruction by
3589 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3590 *
3591 * We can't use the typed atomic op in the FS because that has the execution
3592 * mask ANDed with the pixel mask, but we just want to write the one dword for
3593 * all the pixels.
3594 *
3595  * We don't use the SIMD4x2 atomic ops in the VS because we just want to write
3596 * one u32. So we use the same untyped atomic write message as the pixel
3597 * shader.
3598 *
3599 * The untyped atomic operation requires a BUFFER surface type with RAW
3600 * format, and is only accessible through the legacy DATA_CACHE dataport
3601 * messages.
3602 */
3603 void brw_shader_time_add(struct brw_codegen *p,
3604 struct brw_reg payload,
3605 uint32_t surf_index)
3606 {
3607 const struct gen_device_info *devinfo = p->devinfo;
3608 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3609 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3610 GEN7_SFID_DATAPORT_DATA_CACHE);
3611 assert(devinfo->gen >= 7);
3612
3613 brw_push_insn_state(p);
3614 brw_set_default_access_mode(p, BRW_ALIGN_1);
3615 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3616 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3617 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3618
3619 /* We use brw_vec1_reg and unmasked because we want to increment the given
3620 * offset only once.
3621 */
3622 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3623 BRW_ARF_NULL, 0));
3624 brw_set_src0(p, send, brw_vec1_reg(payload.file,
3625 payload.nr, 0));
3626 brw_set_src1(p, send, brw_imm_ud(0));
3627 brw_set_desc(p, send, brw_message_desc(devinfo, 2, 0, false));
3628 brw_inst_set_sfid(devinfo, send, sfid);
3629 brw_inst_set_binding_table_index(devinfo, send, surf_index);
3630 brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
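/* Summary of the message built above (derived from the code, not the PRM):
 * a two-register untyped atomic ADD with no response (mlen == 2,
 * rlen == 0), i.e. fire and forget.
 */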
3631
3632 brw_pop_insn_state(p);
3633 }
3634
3635
3636 /**
3637 * Emit the SEND message for a barrier
3638 */
3639 void
3640 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3641 {
3642 const struct gen_device_info *devinfo = p->devinfo;
3643 struct brw_inst *inst;
3644
3645 assert(devinfo->gen >= 7);
3646
3647 brw_push_insn_state(p);
3648 brw_set_default_access_mode(p, BRW_ALIGN_1);
3649 inst = next_insn(p, BRW_OPCODE_SEND);
3650 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3651 brw_set_src0(p, inst, src);
3652 brw_set_src1(p, inst, brw_null_reg());
3653 brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
3654
3655 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
3656 brw_inst_set_gateway_notify(devinfo, inst, 1);
3657 brw_inst_set_gateway_subfuncid(devinfo, inst,
3658 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3659
3660 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3661 brw_pop_insn_state(p);
3662 }
3663
3664
3665 /**
3666 * Emit the wait instruction for a barrier
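 * (pairs with brw_barrier() above: the thread blocks on the notification
 * register until the gateway signals completion of the barrier)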
3667 */
3668 void
3669 brw_WAIT(struct brw_codegen *p)
3670 {
3671 const struct gen_device_info *devinfo = p->devinfo;
3672 struct brw_inst *insn;
3673
3674 struct brw_reg src = brw_notification_reg();
3675
3676 insn = next_insn(p, BRW_OPCODE_WAIT);
3677 brw_set_dest(p, insn, src);
3678 brw_set_src0(p, insn, src);
3679 brw_set_src1(p, insn, brw_null_reg());
3680
3681 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3682 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3683 }
3684
3685 /**
3686  * Changes the floating-point rounding mode by updating the control register
3687  * field defined at the cr0.0[5-6] bits. It supports changes to
3688  * RTNE (00), RU (01), RD (10) and RTZ (11) rounding using bitwise
3689  * operations. Only the RTNE and RTZ rounding modes are enabled from NIR.
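 * For example, requesting RTZ (11) skips the AND below (bits equals the
 * full mask, so the OR alone overwrites the field), RTNE (00) needs only
 * the AND, and RU/RD require both steps.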
3690 */
3691 void
3692 brw_rounding_mode(struct brw_codegen *p,
3693 enum brw_rnd_mode mode)
3694 {
3695 const unsigned bits = mode << BRW_CR0_RND_MODE_SHIFT;
3696
3697 if (bits != BRW_CR0_RND_MODE_MASK) {
3698 brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3699 brw_imm_ud(~BRW_CR0_RND_MODE_MASK));
3700 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3701
3702 /* From the Skylake PRM, Volume 7, page 760:
3703 * "Implementation Restriction on Register Access: When the control
3704 * register is used as an explicit source and/or destination, hardware
3705 * does not ensure execution pipeline coherency. Software must set the
3706 * thread control field to ‘switch’ for an instruction that uses
3707 * control register as an explicit operand."
3708 */
3709 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3710 }
3711
3712 if (bits) {
3713 brw_inst *inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3714 brw_imm_ud(bits));
3715 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3716 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3717 }
3718 }