intel/compiler: remove check unsigned is >= 0
[mesa.git] / src / intel / compiler / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 struct brw_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct gen_device_info *devinfo = p->devinfo;
51 if (devinfo->gen < 6)
52 return;
53
54 if (src->file == BRW_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58 brw_push_insn_state(p);
59 brw_set_default_exec_size(p, BRW_EXECUTE_8);
60 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
61 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
62 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
63 retype(*src, BRW_REGISTER_TYPE_UD));
64 brw_pop_insn_state(p);
65 }
66 *src = brw_message_reg(msg_reg_nr);
67 }
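/* A minimal usage sketch (hypothetical register/MRF numbers): before
 * emitting a SEND whose payload currently lives in a GRF on gen6+,
 * resolve the implied move so the payload lands in a message register:
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &payload, 1);
 *    // payload now refers to brw_message_reg(1)
 */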
68
69 static void
70 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
71 {
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
76 *
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
79 */
80 const struct gen_device_info *devinfo = p->devinfo;
81 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
82 reg->file = BRW_GENERAL_REGISTER_FILE;
83 reg->nr += GEN7_MRF_HACK_START;
84 }
85 }
86
87 /**
88 * Convert a brw_reg_type enumeration value into the hardware representation.
89 *
90 * The hardware encoding may depend on whether the value is an immediate.
91 */
92 unsigned
93 brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
94 enum brw_reg_type type, enum brw_reg_file file)
95 {
96 if (file == BRW_IMMEDIATE_VALUE) {
97 static const int imm_hw_types[] = {
98 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
99 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
100 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
101 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
102 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
103 [BRW_REGISTER_TYPE_UB] = -1,
104 [BRW_REGISTER_TYPE_B] = -1,
105 [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
106 [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
107 [BRW_REGISTER_TYPE_V] = BRW_HW_REG_IMM_TYPE_V,
108 [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
109 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
110 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
111 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
112 };
113 assert(type < ARRAY_SIZE(imm_hw_types));
114 assert(imm_hw_types[type] != -1);
115 assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
116 return imm_hw_types[type];
117 } else {
118 /* Non-immediate registers */
119 static const int hw_types[] = {
120 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
121 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D,
122 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
123 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W,
124 [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
125 [BRW_REGISTER_TYPE_B] = BRW_HW_REG_NON_IMM_TYPE_B,
126 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F,
127 [BRW_REGISTER_TYPE_UV] = -1,
128 [BRW_REGISTER_TYPE_VF] = -1,
129 [BRW_REGISTER_TYPE_V] = -1,
130 [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
131 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
132 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
133 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q,
134 };
135 assert(type < ARRAY_SIZE(hw_types));
136 assert(hw_types[type] != -1);
137 assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
138 assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_Q);
139 return hw_types[type];
140 }
141 }
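/* Example, read straight off the tables above: a float encodes the same
 * way in both files, while vector-float exists only as an immediate:
 *
 *    brw_reg_type_to_hw_type(devinfo, BRW_REGISTER_TYPE_F,
 *                            BRW_GENERAL_REGISTER_FILE) == BRW_HW_REG_TYPE_F
 *    brw_reg_type_to_hw_type(devinfo, BRW_REGISTER_TYPE_VF,
 *                            BRW_IMMEDIATE_VALUE) == BRW_HW_REG_IMM_TYPE_VF
 *
 * Passing BRW_REGISTER_TYPE_VF with a non-immediate file trips the
 * hw_types[type] != -1 assertion instead.
 */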
142
143 /**
144 * Return the element size given a hardware register type and file.
145 *
146 * The hardware encoding may depend on whether the value is an immediate.
147 */
148 unsigned
149 brw_hw_reg_type_to_size(const struct gen_device_info *devinfo,
150 unsigned type, enum brw_reg_file file)
151 {
152 if (file == BRW_IMMEDIATE_VALUE) {
153 static const unsigned imm_hw_sizes[] = {
154 [BRW_HW_REG_TYPE_UD] = 4,
155 [BRW_HW_REG_TYPE_D] = 4,
156 [BRW_HW_REG_TYPE_UW] = 2,
157 [BRW_HW_REG_TYPE_W] = 2,
158 [BRW_HW_REG_IMM_TYPE_UV] = 2,
159 [BRW_HW_REG_IMM_TYPE_VF] = 4,
160 [BRW_HW_REG_IMM_TYPE_V] = 2,
161 [BRW_HW_REG_TYPE_F] = 4,
162 [GEN8_HW_REG_TYPE_UQ] = 8,
163 [GEN8_HW_REG_TYPE_Q] = 8,
164 [GEN8_HW_REG_IMM_TYPE_DF] = 8,
165 [GEN8_HW_REG_IMM_TYPE_HF] = 2,
166 };
167 assert(type < ARRAY_SIZE(imm_hw_sizes));
168 assert(devinfo->gen >= 6 || type != BRW_HW_REG_IMM_TYPE_UV);
169 assert(devinfo->gen >= 8 || type <= BRW_HW_REG_TYPE_F);
170 return imm_hw_sizes[type];
171 } else {
172 /* Non-immediate registers */
173 static const unsigned hw_sizes[] = {
174 [BRW_HW_REG_TYPE_UD] = 4,
175 [BRW_HW_REG_TYPE_D] = 4,
176 [BRW_HW_REG_TYPE_UW] = 2,
177 [BRW_HW_REG_TYPE_W] = 2,
178 [BRW_HW_REG_NON_IMM_TYPE_UB] = 1,
179 [BRW_HW_REG_NON_IMM_TYPE_B] = 1,
180 [GEN7_HW_REG_NON_IMM_TYPE_DF] = 8,
181 [BRW_HW_REG_TYPE_F] = 4,
182 [GEN8_HW_REG_TYPE_UQ] = 8,
183 [GEN8_HW_REG_TYPE_Q] = 8,
184 [GEN8_HW_REG_NON_IMM_TYPE_HF] = 2,
185 };
186 assert(type < ARRAY_SIZE(hw_sizes));
187 assert(devinfo->gen >= 7 ||
188 (type < GEN7_HW_REG_NON_IMM_TYPE_DF || type == BRW_HW_REG_TYPE_F));
189 assert(devinfo->gen >= 8 || type <= BRW_HW_REG_TYPE_F);
190 return hw_sizes[type];
191 }
192 }
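/* Example, from the size tables above: UD is 4 bytes in either file,
 * while the gen8 DF immediate encoding is 8 bytes:
 *
 *    brw_hw_reg_type_to_size(devinfo, BRW_HW_REG_TYPE_UD,
 *                            BRW_GENERAL_REGISTER_FILE) == 4
 *    brw_hw_reg_type_to_size(devinfo, GEN8_HW_REG_IMM_TYPE_DF,
 *                            BRW_IMMEDIATE_VALUE) == 8
 */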
193
194 void
195 brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
196 {
197 const struct gen_device_info *devinfo = p->devinfo;
198
199 if (dest.file == BRW_MESSAGE_REGISTER_FILE)
200 assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
201 else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
202 assert(dest.nr < 128);
203
204 gen7_convert_mrf_to_grf(p, &dest);
205
206 brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
207 brw_inst_set_dst_reg_type(devinfo, inst,
208 brw_reg_type_to_hw_type(devinfo, dest.type,
209 dest.file));
210 brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
211
212 if (dest.address_mode == BRW_ADDRESS_DIRECT) {
213 brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
214
215 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
216 brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
217 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
218 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
219 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
220 } else {
221 brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
222 brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
223 if (dest.file == BRW_GENERAL_REGISTER_FILE ||
224 dest.file == BRW_MESSAGE_REGISTER_FILE) {
225 assert(dest.writemask != 0);
226 }
227 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
228 * Although Dst.HorzStride is a don't care for Align16, HW needs
229 * this to be programmed as "01".
230 */
231 brw_inst_set_dst_hstride(devinfo, inst, 1);
232 }
233 } else {
234 brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);
235
236 /* These are different sizes in align1 vs align16:
237 */
238 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
239 brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
240 dest.indirect_offset);
241 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
242 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
243 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
244 } else {
245 brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
246 dest.indirect_offset);
247 /* Even though it's ignored in da16 mode, this still needs to be set to '01'. */
248 brw_inst_set_dst_hstride(devinfo, inst, 1);
249 }
250 }
251
252 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
253 * or 16 (SIMD16), as that's normally correct. However, when dealing with
254 * small registers, we automatically reduce it to match the register size.
255 *
256 * In platforms that support fp64 we can emit instructions with a width of
257 * 4 that need two SIMD8 registers and an exec_size of 8 or 16. In these
258 * cases we need to make sure that these instructions have their exec sizes
259 * set properly when they are emitted and we can't rely on this code to fix
260 * it.
261 */
262 bool fix_exec_size;
263 if (devinfo->gen >= 6)
264 fix_exec_size = dest.width < BRW_EXECUTE_4;
265 else
266 fix_exec_size = dest.width < BRW_EXECUTE_8;
267
268 if (fix_exec_size)
269 brw_inst_set_exec_size(devinfo, inst, dest.width);
270 }
271
272 static void
273 validate_reg(const struct gen_device_info *devinfo,
274 brw_inst *inst, struct brw_reg reg)
275 {
276 const int hstride_for_reg[] = {0, 1, 2, 4};
277 const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
278 const int width_for_reg[] = {1, 2, 4, 8, 16};
279 const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
280 int width, hstride, vstride, execsize;
281
282 if (reg.file == BRW_IMMEDIATE_VALUE) {
283 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
284 * mean the destination has to be 128-bit aligned and the
285 * destination horiz stride has to be a word.
286 */
287 if (reg.type == BRW_REGISTER_TYPE_V) {
288 unsigned UNUSED elem_size = brw_element_size(devinfo, inst, dst);
289 assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
290 elem_size == 2);
291 }
292
293 return;
294 }
295
296 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
297 reg.nr == BRW_ARF_NULL)
298 return;
299
300 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
301 *
302 * "Swizzling is not allowed when an accumulator is used as an implicit
303 * source or an explicit source in an instruction."
304 */
305 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
306 reg.nr == BRW_ARF_ACCUMULATOR)
307 assert(reg.swizzle == BRW_SWIZZLE_XYZW);
308
309 assert(reg.hstride < ARRAY_SIZE(hstride_for_reg));
310 hstride = hstride_for_reg[reg.hstride];
311
312 if (reg.vstride == 0xf) {
313 vstride = -1;
314 } else {
315 assert(reg.vstride < ARRAY_SIZE(vstride_for_reg));
316 vstride = vstride_for_reg[reg.vstride];
317 }
318
319 assert(reg.width < ARRAY_SIZE(width_for_reg));
320 width = width_for_reg[reg.width];
321
322 assert(brw_inst_exec_size(devinfo, inst) <
323 ARRAY_SIZE(execsize_for_reg));
324 execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];
325
326 /* Restrictions from 3.3.10: Register Region Restrictions. */
327 /* 3. */
328 assert(execsize >= width);
329
330 /* 4. */
331 if (execsize == width && hstride != 0) {
332 assert(vstride == -1 || vstride == width * hstride);
333 }
334
335 /* 5. */
336 if (execsize == width && hstride == 0) {
337 /* no restriction on vstride. */
338 }
339
340 /* 6. */
341 if (width == 1) {
342 assert(hstride == 0);
343 }
344
345 /* 7. */
346 if (execsize == 1 && width == 1) {
347 assert(hstride == 0);
348 assert(vstride == 0);
349 }
350
351 /* 8. */
352 if (vstride == 0 && hstride == 0) {
353 assert(width == 1);
354 }
355
356 /* 10. Check destination issues. */
357 }
358
359 static bool
360 is_compactable_immediate(unsigned imm)
361 {
362 /* We get the low 12 bits as-is. */
363 imm &= ~0xfff;
364
365 /* We get one bit replicated through the top 20 bits. */
366 return imm == 0 || imm == 0xfffff000;
367 }
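/* Worked example: only the low 12 bits are stored, plus one bit that is
 * replicated through the top 20, so exactly the sign-extended 12-bit
 * values survive the round trip:
 *
 *    is_compactable_immediate(0x00000abc)  -> true  (top 20 bits all 0)
 *    is_compactable_immediate(0xfffffabc)  -> true  (top 20 bits all 1)
 *    is_compactable_immediate(0x00001abc)  -> false (0x00001000 left over)
 */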
368
369 void
370 brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
371 {
372 const struct gen_device_info *devinfo = p->devinfo;
373
374 if (reg.file == BRW_MESSAGE_REGISTER_FILE)
375 assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
376 else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
377 assert(reg.nr < 128);
378
379 gen7_convert_mrf_to_grf(p, &reg);
380
381 if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
382 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
383 /* Any source modifiers or regions will be ignored, since this just
384 * identifies the MRF/GRF to start reading the message contents from.
385 * Check for some likely failures.
386 */
387 assert(!reg.negate);
388 assert(!reg.abs);
389 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
390 }
391
392 validate_reg(devinfo, inst, reg);
393
394 brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
395 brw_inst_set_src0_reg_type(devinfo, inst,
396 brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
397 brw_inst_set_src0_abs(devinfo, inst, reg.abs);
398 brw_inst_set_src0_negate(devinfo, inst, reg.negate);
399 brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
400
401 if (reg.file == BRW_IMMEDIATE_VALUE) {
402 if (reg.type == BRW_REGISTER_TYPE_DF ||
403 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
404 brw_inst_set_imm_df(devinfo, inst, reg.df);
405 else if (reg.type == BRW_REGISTER_TYPE_UQ ||
406 reg.type == BRW_REGISTER_TYPE_Q)
407 brw_inst_set_imm_uq(devinfo, inst, reg.u64);
408 else
409 brw_inst_set_imm_ud(devinfo, inst, reg.ud);
410
411 /* The Bspec's section titled "Non-present Operands" claims that if src0
412 * is an immediate, then src1's type must be the same as that of src0.
413 *
414 * The SNB+ DataTypeIndex instruction compaction tables contain mappings
415 * that do not follow this rule. E.g., from the IVB/HSW table:
416 *
417 * DataTypeIndex 18-Bit Mapping Mapped Meaning
418 * 3 001000001011111101 r:f | i:vf | a:ud | <1> | dir |
419 *
420 * And from the SNB table:
421 *
422 * DataTypeIndex 18-Bit Mapping Mapped Meaning
423 * 8 001000000111101100 a:w | i:w | a:ud | <1> | dir |
424 *
425 * Neither of these cause warnings from the simulator when used,
426 * compacted or otherwise. In fact, all compaction mappings that have an
427 * immediate in src0 use a:ud for src1.
428 *
429 * The GM45 instruction compaction tables do not contain mapped meanings
430 * so it's not clear whether it has the restriction. We'll assume it was
431 * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
432 *
433 * Don't do any of this for 64-bit immediates, since the src1 fields
434 * overlap with the immediate and setting them would overwrite the
435 * immediate we set.
436 */
437 if (type_sz(reg.type) < 8) {
438 brw_inst_set_src1_reg_file(devinfo, inst,
439 BRW_ARCHITECTURE_REGISTER_FILE);
440 if (devinfo->gen < 6) {
441 brw_inst_set_src1_reg_type(devinfo, inst,
442 brw_inst_src0_reg_type(devinfo, inst));
443 } else {
444 brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
445 }
446 }
447
448 /* Compacted instructions only have 12 bits (plus 1 for the other 20)
449 * for immediate values. Presumably the hardware engineers realized
450 * that the only useful floating-point value that could be represented
451 * in this format is 0.0, which can also be represented as a VF-typed
452 * immediate, so they gave us the previously mentioned mapping on IVB+.
453 *
454 * Strangely, we do have a mapping for imm:f in src1, so we don't need
455 * to do this there.
456 *
457 * If we see a 0.0:F, change the type to VF so that it can be compacted.
458 */
459 if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
460 brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F &&
461 brw_inst_dst_reg_type(devinfo, inst) != GEN7_HW_REG_NON_IMM_TYPE_DF) {
462 brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
463 }
464
465 /* There are no mappings for dst:d | i:d, so if the immediate is suitable
466 * set the types to :UD so the instruction can be compacted.
467 */
468 if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
469 brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
470 brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
471 brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
472 brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
473 brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
474 }
475 } else {
476 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
477 brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
478 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
479 brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
480 } else {
481 brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
482 }
483 } else {
484 brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
485
486 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
487 brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
488 } else {
489 brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
490 }
491 }
492
493 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
494 if (reg.width == BRW_WIDTH_1 &&
495 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
496 brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
497 brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
498 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
499 } else {
500 brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
501 brw_inst_set_src0_width(devinfo, inst, reg.width);
502 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
503 }
504 } else {
505 brw_inst_set_src0_da16_swiz_x(devinfo, inst,
506 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
507 brw_inst_set_src0_da16_swiz_y(devinfo, inst,
508 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
509 brw_inst_set_src0_da16_swiz_z(devinfo, inst,
510 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
511 brw_inst_set_src0_da16_swiz_w(devinfo, inst,
512 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
513
514 if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
515 /* This is an oddity arising from the fact that we use the same
516 * region descriptions for registers in align_16 as in align_1:
517 */
518 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
519 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
520 reg.type == BRW_REGISTER_TYPE_DF &&
521 reg.vstride == BRW_VERTICAL_STRIDE_2) {
522 /* From SNB PRM:
523 *
524 * "For Align16 access mode, only encodings of 0000 and 0011
525 * are allowed. Other codes are reserved."
526 *
527 * Presumably the DevSNB behavior applies to IVB as well.
528 */
529 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
530 } else {
531 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
532 }
533 }
534 }
535 }
536
537
538 void
539 brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
540 {
541 const struct gen_device_info *devinfo = p->devinfo;
542
543 if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
544 assert(reg.nr < 128);
545
546 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
547 *
548 * "Accumulator registers may be accessed explicitly as src0
549 * operands only."
550 */
551 assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
552 reg.nr != BRW_ARF_ACCUMULATOR);
553
554 gen7_convert_mrf_to_grf(p, &reg);
555 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
556
557 validate_reg(devinfo, inst, reg);
558
559 brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
560 brw_inst_set_src1_reg_type(devinfo, inst,
561 brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
562 brw_inst_set_src1_abs(devinfo, inst, reg.abs);
563 brw_inst_set_src1_negate(devinfo, inst, reg.negate);
564
565 /* Only src1 can be immediate in two-argument instructions.
566 */
567 assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
568
569 if (reg.file == BRW_IMMEDIATE_VALUE) {
570 /* two-argument instructions can only use 32-bit immediates */
571 assert(type_sz(reg.type) < 8);
572 brw_inst_set_imm_ud(devinfo, inst, reg.ud);
573 } else {
574 /* This is a hardware restriction, which may or may not be lifted
575 * in the future:
576 */
577 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
578 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
579
580 brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
581 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
582 brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
583 } else {
584 brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
585 }
586
587 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
588 if (reg.width == BRW_WIDTH_1 &&
589 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
590 brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
591 brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
592 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
593 } else {
594 brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
595 brw_inst_set_src1_width(devinfo, inst, reg.width);
596 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
597 }
598 } else {
599 brw_inst_set_src1_da16_swiz_x(devinfo, inst,
600 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
601 brw_inst_set_src1_da16_swiz_y(devinfo, inst,
602 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
603 brw_inst_set_src1_da16_swiz_z(devinfo, inst,
604 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
605 brw_inst_set_src1_da16_swiz_w(devinfo, inst,
606 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
607
608 if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
609 /* This is an oddity arising from the fact that we use the same
610 * region descriptions for registers in align_16 as in align_1:
611 */
612 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
613 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
614 reg.type == BRW_REGISTER_TYPE_DF &&
615 reg.vstride == BRW_VERTICAL_STRIDE_2) {
616 /* From SNB PRM:
617 *
618 * "For Align16 access mode, only encodings of 0000 and 0011
619 * are allowed. Other codes are reserved."
620 *
621 * Presumably the DevSNB behavior applies to IVB as well.
622 */
623 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
624 } else {
625 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
626 }
627 }
628 }
629 }
630
631 /**
632 * Set the Message Descriptor and Extended Message Descriptor fields
633 * for SEND messages.
634 *
635 * \note This zeroes out the Function Control bits, so it must be called
636 * \b before filling out any message-specific data. Callers can
637 * choose not to fill in irrelevant bits; they will be zero.
638 */
639 void
640 brw_set_message_descriptor(struct brw_codegen *p,
641 brw_inst *inst,
642 enum brw_message_target sfid,
643 unsigned msg_length,
644 unsigned response_length,
645 bool header_present,
646 bool end_of_thread)
647 {
648 const struct gen_device_info *devinfo = p->devinfo;
649
650 brw_set_src1(p, inst, brw_imm_d(0));
651
652 /* For indirect sends, `inst` will not be the SEND/SENDC instruction
653 * itself; instead, it will be a MOV/OR into the address register.
654 *
655 * In this case, we avoid setting the extended message descriptor bits,
656 * since they go on the later SEND/SENDC instead and if set here would
657 * instead clobber the conditional modifier bits.
658 */
659 unsigned opcode = brw_inst_opcode(devinfo, inst);
660 if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
661 brw_inst_set_sfid(devinfo, inst, sfid);
662 }
663
664 brw_inst_set_mlen(devinfo, inst, msg_length);
665 brw_inst_set_rlen(devinfo, inst, response_length);
666 brw_inst_set_eot(devinfo, inst, end_of_thread);
667
668 if (devinfo->gen >= 5) {
669 brw_inst_set_header_present(devinfo, inst, header_present);
670 }
671 }
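/* Ordering sketch (hypothetical message): because this helper zeroes the
 * function control bits via brw_set_src1(..., brw_imm_d(0)), any
 * message-specific setters must come after it:
 *
 *    brw_set_message_descriptor(p, inst, BRW_SFID_URB, 1, 0, true, false);
 *    brw_inst_set_urb_opcode(devinfo, inst, BRW_URB_OPCODE_WRITE_HWORD);
 */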
672
673 static void brw_set_math_message( struct brw_codegen *p,
674 brw_inst *inst,
675 unsigned function,
676 unsigned integer_type,
677 bool low_precision,
678 unsigned dataType )
679 {
680 const struct gen_device_info *devinfo = p->devinfo;
681 unsigned msg_length;
682 unsigned response_length;
683
684 /* Infer message length from the function */
685 switch (function) {
686 case BRW_MATH_FUNCTION_POW:
687 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
688 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
689 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
690 msg_length = 2;
691 break;
692 default:
693 msg_length = 1;
694 break;
695 }
696
697 /* Infer response length from the function */
698 switch (function) {
699 case BRW_MATH_FUNCTION_SINCOS:
700 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
701 response_length = 2;
702 break;
703 default:
704 response_length = 1;
705 break;
706 }
707
708
709 brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
710 msg_length, response_length, false, false);
711 brw_inst_set_math_msg_function(devinfo, inst, function);
712 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
713 brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
714 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
715 brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
716 brw_inst_set_saturate(devinfo, inst, 0);
717 }
718
719
720 static void brw_set_ff_sync_message(struct brw_codegen *p,
721 brw_inst *insn,
722 bool allocate,
723 unsigned response_length,
724 bool end_of_thread)
725 {
726 const struct gen_device_info *devinfo = p->devinfo;
727
728 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
729 1, response_length, true, end_of_thread);
730 brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
731 brw_inst_set_urb_allocate(devinfo, insn, allocate);
732 /* The following fields are not used by FF_SYNC: */
733 brw_inst_set_urb_global_offset(devinfo, insn, 0);
734 brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
735 brw_inst_set_urb_used(devinfo, insn, 0);
736 brw_inst_set_urb_complete(devinfo, insn, 0);
737 }
738
739 static void brw_set_urb_message( struct brw_codegen *p,
740 brw_inst *insn,
741 enum brw_urb_write_flags flags,
742 unsigned msg_length,
743 unsigned response_length,
744 unsigned offset,
745 unsigned swizzle_control )
746 {
747 const struct gen_device_info *devinfo = p->devinfo;
748
749 assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
750 assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
751 assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
752
753 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
754 msg_length, response_length, true,
755 flags & BRW_URB_WRITE_EOT);
756
757 if (flags & BRW_URB_WRITE_OWORD) {
758 assert(msg_length == 2); /* header + one OWORD of data */
759 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
760 } else {
761 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
762 }
763
764 brw_inst_set_urb_global_offset(devinfo, insn, offset);
765 brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
766
767 if (devinfo->gen < 8) {
768 brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
769 }
770
771 if (devinfo->gen < 7) {
772 brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
773 brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
774 } else {
775 brw_inst_set_urb_per_slot_offset(devinfo, insn,
776 !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
777 }
778 }
779
780 void
781 brw_set_dp_write_message(struct brw_codegen *p,
782 brw_inst *insn,
783 unsigned binding_table_index,
784 unsigned msg_control,
785 unsigned msg_type,
786 unsigned target_cache,
787 unsigned msg_length,
788 bool header_present,
789 unsigned last_render_target,
790 unsigned response_length,
791 unsigned end_of_thread,
792 unsigned send_commit_msg)
793 {
794 const struct gen_device_info *devinfo = p->devinfo;
795 const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
796 BRW_SFID_DATAPORT_WRITE);
797
798 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
799 header_present, end_of_thread);
800
801 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
802 brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
803 brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
804 brw_inst_set_rt_last(devinfo, insn, last_render_target);
805 if (devinfo->gen < 7) {
806 brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
807 }
808 }
809
810 void
811 brw_set_dp_read_message(struct brw_codegen *p,
812 brw_inst *insn,
813 unsigned binding_table_index,
814 unsigned msg_control,
815 unsigned msg_type,
816 unsigned target_cache,
817 unsigned msg_length,
818 bool header_present,
819 unsigned response_length)
820 {
821 const struct gen_device_info *devinfo = p->devinfo;
822 const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
823 BRW_SFID_DATAPORT_READ);
824
825 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
826 header_present, false);
827
828 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
829 brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
830 brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
831 if (devinfo->gen < 6)
832 brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
833 }
834
835 void
836 brw_set_sampler_message(struct brw_codegen *p,
837 brw_inst *inst,
838 unsigned binding_table_index,
839 unsigned sampler,
840 unsigned msg_type,
841 unsigned response_length,
842 unsigned msg_length,
843 unsigned header_present,
844 unsigned simd_mode,
845 unsigned return_format)
846 {
847 const struct gen_device_info *devinfo = p->devinfo;
848
849 brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
850 response_length, header_present, false);
851
852 brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
853 brw_inst_set_sampler(devinfo, inst, sampler);
854 brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
855 if (devinfo->gen >= 5) {
856 brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
857 } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
858 brw_inst_set_sampler_return_format(devinfo, inst, return_format);
859 }
860 }
861
862 static void
863 gen7_set_dp_scratch_message(struct brw_codegen *p,
864 brw_inst *inst,
865 bool write,
866 bool dword,
867 bool invalidate_after_read,
868 unsigned num_regs,
869 unsigned addr_offset,
870 unsigned mlen,
871 unsigned rlen,
872 bool header_present)
873 {
874 const struct gen_device_info *devinfo = p->devinfo;
875 assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
876 (devinfo->gen >= 8 && num_regs == 8));
877 const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
878 num_regs - 1);
879
880 brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
881 mlen, rlen, header_present, false);
882 brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
883 brw_inst_set_scratch_read_write(devinfo, inst, write);
884 brw_inst_set_scratch_type(devinfo, inst, dword);
885 brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
886 brw_inst_set_scratch_block_size(devinfo, inst, block_size);
887 brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
888 }
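/* Block-size encodings implied by the computation above:
 *
 *    gen7, num_regs = 2  ->  block_size = num_regs - 1      = 1
 *    gen8, num_regs = 8  ->  block_size = _mesa_logbase2(8) = 3
 *
 * which is why num_regs == 8 is only accepted on gen >= 8.
 */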
889
890 #define next_insn brw_next_insn
891 brw_inst *
892 brw_next_insn(struct brw_codegen *p, unsigned opcode)
893 {
894 const struct gen_device_info *devinfo = p->devinfo;
895 brw_inst *insn;
896
897 if (p->nr_insn + 1 > p->store_size) {
898 p->store_size <<= 1;
899 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
900 }
901
902 p->next_insn_offset += 16;
903 insn = &p->store[p->nr_insn++];
904 memcpy(insn, p->current, sizeof(*insn));
905
906 brw_inst_set_opcode(devinfo, insn, opcode);
907 return insn;
908 }
909
910 static brw_inst *
911 brw_alu1(struct brw_codegen *p, unsigned opcode,
912 struct brw_reg dest, struct brw_reg src)
913 {
914 brw_inst *insn = next_insn(p, opcode);
915 brw_set_dest(p, insn, dest);
916 brw_set_src0(p, insn, src);
917 return insn;
918 }
919
920 static brw_inst *
921 brw_alu2(struct brw_codegen *p, unsigned opcode,
922 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
923 {
924 /* 64-bit immediates are only supported on 1-src instructions */
925 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
926 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
927
928 brw_inst *insn = next_insn(p, opcode);
929 brw_set_dest(p, insn, dest);
930 brw_set_src0(p, insn, src0);
931 brw_set_src1(p, insn, src1);
932 return insn;
933 }
934
935 static int
936 get_3src_subreg_nr(struct brw_reg reg)
937 {
938 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
939 * use 32-bit units (components 0..7). Since they only support F/D/UD
940 * types, this doesn't lose any flexibility, but uses fewer bits.
941 */
942 return reg.subnr / 4;
943 }
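/* Example of the unit conversion: an operand whose SubRegNum is byte 16
 * of the register encodes here as component 16 / 4 = 4, i.e. the fifth
 * 32-bit channel.
 */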
944
945 static brw_inst *
946 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
947 struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
948 {
949 const struct gen_device_info *devinfo = p->devinfo;
950 brw_inst *inst = next_insn(p, opcode);
951
952 gen7_convert_mrf_to_grf(p, &dest);
953
954 assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);
955
956 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
957 dest.file == BRW_MESSAGE_REGISTER_FILE);
958 assert(dest.nr < 128);
959 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
960 assert(dest.type == BRW_REGISTER_TYPE_F ||
961 dest.type == BRW_REGISTER_TYPE_DF ||
962 dest.type == BRW_REGISTER_TYPE_D ||
963 dest.type == BRW_REGISTER_TYPE_UD);
964 if (devinfo->gen == 6) {
965 brw_inst_set_3src_dst_reg_file(devinfo, inst,
966 dest.file == BRW_MESSAGE_REGISTER_FILE);
967 }
968 brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
969 brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
970 brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);
971
972 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
973 assert(src0.address_mode == BRW_ADDRESS_DIRECT);
974 assert(src0.nr < 128);
975 brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
976 brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
977 brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
978 brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
979 brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
980 brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
981 src0.vstride == BRW_VERTICAL_STRIDE_0);
982
983 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
984 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
985 assert(src1.nr < 128);
986 brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
987 brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
988 brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
989 brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
990 brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
991 brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
992 src1.vstride == BRW_VERTICAL_STRIDE_0);
993
994 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
995 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
996 assert(src2.nr < 128);
997 brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
998 brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
999 brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
1000 brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
1001 brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
1002 brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
1003 src2.vstride == BRW_VERTICAL_STRIDE_0);
1004
1005 if (devinfo->gen >= 7) {
1006 /* Set both the source and destination types based on dest.type,
1007 * ignoring the source register types. The MAD and LRP emitters ensure
1008 * that all four types are float. The BFE and BFI2 emitters, however,
1009 * may send us mixed D and UD types and want us to ignore that and use
1010 * the destination type.
1011 */
1012 switch (dest.type) {
1013 case BRW_REGISTER_TYPE_F:
1014 brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
1015 brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
1016 break;
1017 case BRW_REGISTER_TYPE_DF:
1018 brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_DF);
1019 brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_DF);
1020 break;
1021 case BRW_REGISTER_TYPE_D:
1022 brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
1023 brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
1024 break;
1025 case BRW_REGISTER_TYPE_UD:
1026 brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
1027 brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
1028 break;
1029 default:
1030 unreachable("not reached");
1031 }
1032 }
1033
1034 return inst;
1035 }
1036
1037
1038 /***********************************************************************
1039 * Convenience routines.
1040 */
1041 #define ALU1(OP) \
1042 brw_inst *brw_##OP(struct brw_codegen *p, \
1043 struct brw_reg dest, \
1044 struct brw_reg src0) \
1045 { \
1046 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
1047 }
1048
1049 #define ALU2(OP) \
1050 brw_inst *brw_##OP(struct brw_codegen *p, \
1051 struct brw_reg dest, \
1052 struct brw_reg src0, \
1053 struct brw_reg src1) \
1054 { \
1055 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
1056 }
1057
1058 #define ALU3(OP) \
1059 brw_inst *brw_##OP(struct brw_codegen *p, \
1060 struct brw_reg dest, \
1061 struct brw_reg src0, \
1062 struct brw_reg src1, \
1063 struct brw_reg src2) \
1064 { \
1065 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
1066 }
1067
1068 #define ALU3F(OP) \
1069 brw_inst *brw_##OP(struct brw_codegen *p, \
1070 struct brw_reg dest, \
1071 struct brw_reg src0, \
1072 struct brw_reg src1, \
1073 struct brw_reg src2) \
1074 { \
1075 assert(dest.type == BRW_REGISTER_TYPE_F || \
1076 dest.type == BRW_REGISTER_TYPE_DF); \
1077 if (dest.type == BRW_REGISTER_TYPE_F) { \
1078 assert(src0.type == BRW_REGISTER_TYPE_F); \
1079 assert(src1.type == BRW_REGISTER_TYPE_F); \
1080 assert(src2.type == BRW_REGISTER_TYPE_F); \
1081 } else if (dest.type == BRW_REGISTER_TYPE_DF) { \
1082 assert(src0.type == BRW_REGISTER_TYPE_DF); \
1083 assert(src1.type == BRW_REGISTER_TYPE_DF); \
1084 assert(src2.type == BRW_REGISTER_TYPE_DF); \
1085 } \
1086 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
1087 }
1088
1089 /* Rounding operations (other than RNDD) require two instructions - the first
1090 * stores a rounded value (possibly the wrong way) in the dest register, but
1091 * also sets a per-channel "increment bit" in the flag register. A predicated
1092 * add of 1.0 fixes dest to contain the desired result.
1093 *
1094 * Sandybridge and later appear to round correctly without an ADD.
1095 */
1096 #define ROUND(OP) \
1097 void brw_##OP(struct brw_codegen *p, \
1098 struct brw_reg dest, \
1099 struct brw_reg src) \
1100 { \
1101 const struct gen_device_info *devinfo = p->devinfo; \
1102 brw_inst *rnd, *add; \
1103 rnd = next_insn(p, BRW_OPCODE_##OP); \
1104 brw_set_dest(p, rnd, dest); \
1105 brw_set_src0(p, rnd, src); \
1106 \
1107 if (devinfo->gen < 6) { \
1108 /* turn on round-increments */ \
1109 brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \
1110 add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \
1111 brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
1112 } \
1113 }
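/* Rough sketch of what ROUND(RNDZ) emits on a pre-gen6 part (gen6+ emits
 * only the first instruction, since it rounds correctly on its own):
 *
 *    rndz.r dst, src           // also sets the round-increment flag
 *    (+f0) add dst, dst, 1.0F  // predicated fixup of mis-rounded channels
 */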
1114
1115
1116 ALU2(SEL)
1117 ALU1(NOT)
1118 ALU2(AND)
1119 ALU2(OR)
1120 ALU2(XOR)
1121 ALU2(SHR)
1122 ALU2(SHL)
1123 ALU1(DIM)
1124 ALU2(ASR)
1125 ALU1(FRC)
1126 ALU1(RNDD)
1127 ALU2(MAC)
1128 ALU2(MACH)
1129 ALU1(LZD)
1130 ALU2(DP4)
1131 ALU2(DPH)
1132 ALU2(DP3)
1133 ALU2(DP2)
1134 ALU3F(MAD)
1135 ALU3F(LRP)
1136 ALU1(BFREV)
1137 ALU3(BFE)
1138 ALU2(BFI1)
1139 ALU3(BFI2)
1140 ALU1(FBH)
1141 ALU1(FBL)
1142 ALU1(CBIT)
1143 ALU2(ADDC)
1144 ALU2(SUBB)
1145
1146 ROUND(RNDZ)
1147 ROUND(RNDE)
1148
1149 brw_inst *
1150 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1151 {
1152 const struct gen_device_info *devinfo = p->devinfo;
1153
1154 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1155 * To avoid the problems that causes, we use a <1,2,0> source region to read
1156 * each element twice.
1157 */
1158 if (devinfo->gen == 7 && !devinfo->is_haswell &&
1159 brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1 &&
1160 dest.type == BRW_REGISTER_TYPE_DF &&
1161 (src0.type == BRW_REGISTER_TYPE_F ||
1162 src0.type == BRW_REGISTER_TYPE_D ||
1163 src0.type == BRW_REGISTER_TYPE_UD) &&
1164 !has_scalar_region(src0)) {
1165 assert(src0.vstride == BRW_VERTICAL_STRIDE_4 &&
1166 src0.width == BRW_WIDTH_4 &&
1167 src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1168
1169 src0.vstride = BRW_VERTICAL_STRIDE_1;
1170 src0.width = BRW_WIDTH_2;
1171 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1172 }
1173
1174 return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1175 }
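/* Illustration of the <1,2,0> region used above: for source channels
 * f0 f1 f2 f3, a <vstride=1, width=2, hstride=0> region reads
 *
 *    f0 f0 f1 f1 f2 f2 f3 f3
 *
 * so each of the ignored odd channels of the F->DF conversion sees a
 * copy of the element before it.
 */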
1176
1177 brw_inst *
1178 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1179 struct brw_reg src0, struct brw_reg src1)
1180 {
1181 /* 6.2.2: add */
1182 if (src0.type == BRW_REGISTER_TYPE_F ||
1183 (src0.file == BRW_IMMEDIATE_VALUE &&
1184 src0.type == BRW_REGISTER_TYPE_VF)) {
1185 assert(src1.type != BRW_REGISTER_TYPE_UD);
1186 assert(src1.type != BRW_REGISTER_TYPE_D);
1187 }
1188
1189 if (src1.type == BRW_REGISTER_TYPE_F ||
1190 (src1.file == BRW_IMMEDIATE_VALUE &&
1191 src1.type == BRW_REGISTER_TYPE_VF)) {
1192 assert(src0.type != BRW_REGISTER_TYPE_UD);
1193 assert(src0.type != BRW_REGISTER_TYPE_D);
1194 }
1195
1196 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1197 }
1198
1199 brw_inst *
1200 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1201 struct brw_reg src0, struct brw_reg src1)
1202 {
1203 assert(dest.type == src0.type);
1204 assert(src0.type == src1.type);
1205 switch (src0.type) {
1206 case BRW_REGISTER_TYPE_B:
1207 case BRW_REGISTER_TYPE_UB:
1208 case BRW_REGISTER_TYPE_W:
1209 case BRW_REGISTER_TYPE_UW:
1210 case BRW_REGISTER_TYPE_D:
1211 case BRW_REGISTER_TYPE_UD:
1212 break;
1213 default:
1214 unreachable("Bad type for brw_AVG");
1215 }
1216
1217 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1218 }
1219
1220 brw_inst *
1221 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1222 struct brw_reg src0, struct brw_reg src1)
1223 {
1224 /* 6.32.38: mul */
1225 if (src0.type == BRW_REGISTER_TYPE_D ||
1226 src0.type == BRW_REGISTER_TYPE_UD ||
1227 src1.type == BRW_REGISTER_TYPE_D ||
1228 src1.type == BRW_REGISTER_TYPE_UD) {
1229 assert(dest.type != BRW_REGISTER_TYPE_F);
1230 }
1231
1232 if (src0.type == BRW_REGISTER_TYPE_F ||
1233 (src0.file == BRW_IMMEDIATE_VALUE &&
1234 src0.type == BRW_REGISTER_TYPE_VF)) {
1235 assert(src1.type != BRW_REGISTER_TYPE_UD);
1236 assert(src1.type != BRW_REGISTER_TYPE_D);
1237 }
1238
1239 if (src1.type == BRW_REGISTER_TYPE_F ||
1240 (src1.file == BRW_IMMEDIATE_VALUE &&
1241 src1.type == BRW_REGISTER_TYPE_VF)) {
1242 assert(src0.type != BRW_REGISTER_TYPE_UD);
1243 assert(src0.type != BRW_REGISTER_TYPE_D);
1244 }
1245
1246 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1247 src0.nr != BRW_ARF_ACCUMULATOR);
1248 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1249 src1.nr != BRW_ARF_ACCUMULATOR);
1250
1251 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1252 }
1253
1254 brw_inst *
1255 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1256 struct brw_reg src0, struct brw_reg src1)
1257 {
1258 src0.vstride = BRW_VERTICAL_STRIDE_0;
1259 src0.width = BRW_WIDTH_1;
1260 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1261 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1262 }
1263
1264 brw_inst *
1265 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1266 struct brw_reg src0, struct brw_reg src1)
1267 {
1268 src0.vstride = BRW_VERTICAL_STRIDE_0;
1269 src0.width = BRW_WIDTH_1;
1270 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1271 src1.vstride = BRW_VERTICAL_STRIDE_8;
1272 src1.width = BRW_WIDTH_8;
1273 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1274 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1275 }
1276
1277 brw_inst *
1278 brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1279 {
1280 const struct gen_device_info *devinfo = p->devinfo;
1281 const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
1282 /* The F32TO16 instruction doesn't support 32-bit destination types in
1283 * Align1 mode, and neither does the Gen8 implementation in terms of a
1284 * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
1285 * an undocumented feature.
1286 */
1287 const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
1288 (!align16 || devinfo->gen >= 8));
1289 brw_inst *inst;
1290
1291 if (align16) {
1292 assert(dst.type == BRW_REGISTER_TYPE_UD);
1293 } else {
1294 assert(dst.type == BRW_REGISTER_TYPE_UD ||
1295 dst.type == BRW_REGISTER_TYPE_W ||
1296 dst.type == BRW_REGISTER_TYPE_UW ||
1297 dst.type == BRW_REGISTER_TYPE_HF);
1298 }
1299
1300 brw_push_insn_state(p);
1301
1302 if (needs_zero_fill) {
1303 brw_set_default_access_mode(p, BRW_ALIGN_1);
1304 dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
1305 }
1306
1307 if (devinfo->gen >= 8) {
1308 inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
1309 } else {
1310 assert(devinfo->gen == 7);
1311 inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
1312 }
1313
1314 if (needs_zero_fill) {
1315 brw_inst_set_no_dd_clear(devinfo, inst, true);
1316 inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
1317 brw_inst_set_no_dd_check(devinfo, inst, true);
1318 }
1319
1320 brw_pop_insn_state(p);
1321 return inst;
1322 }
1323
1324 brw_inst *
1325 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1326 {
1327 const struct gen_device_info *devinfo = p->devinfo;
1328 bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
1329
1330 if (align16) {
1331 assert(src.type == BRW_REGISTER_TYPE_UD);
1332 } else {
1333 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1334 *
1335 * Because this instruction does not have a 16-bit floating-point
1336 * type, the source data type must be Word (W). The destination type
1337 * must be F (Float).
1338 */
1339 if (src.type == BRW_REGISTER_TYPE_UD)
1340 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1341
1342 assert(src.type == BRW_REGISTER_TYPE_W ||
1343 src.type == BRW_REGISTER_TYPE_UW ||
1344 src.type == BRW_REGISTER_TYPE_HF);
1345 }
1346
1347 if (devinfo->gen >= 8) {
1348 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1349 } else {
1350 assert(devinfo->gen == 7);
1351 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1352 }
1353 }
1354
1355
1356 void brw_NOP(struct brw_codegen *p)
1357 {
1358 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1359 memset(insn, 0, sizeof(*insn));
1360 brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1361 }
1362
1363
1364
1365
1366
1367 /***********************************************************************
1368 * Comparisons, if/else/endif
1369 */
1370
1371 brw_inst *
1372 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1373 unsigned predicate_control)
1374 {
1375 const struct gen_device_info *devinfo = p->devinfo;
1376 struct brw_reg ip = brw_ip_reg();
1377 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1378
1379 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
1380 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1381 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1382 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1383
1384 return inst;
1385 }
1386
1387 static void
1388 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1389 {
1390 p->if_stack[p->if_stack_depth] = inst - p->store;
1391
1392 p->if_stack_depth++;
1393 if (p->if_stack_array_size <= p->if_stack_depth) {
1394 p->if_stack_array_size *= 2;
1395 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1396 p->if_stack_array_size);
1397 }
1398 }
1399
1400 static brw_inst *
1401 pop_if_stack(struct brw_codegen *p)
1402 {
1403 p->if_stack_depth--;
1404 return &p->store[p->if_stack[p->if_stack_depth]];
1405 }
1406
1407 static void
1408 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1409 {
1410 if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1411 p->loop_stack_array_size *= 2;
1412 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1413 p->loop_stack_array_size);
1414 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1415 p->loop_stack_array_size);
1416 }
1417
1418 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1419 p->loop_stack_depth++;
1420 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1421 }
1422
1423 static brw_inst *
1424 get_inner_do_insn(struct brw_codegen *p)
1425 {
1426 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1427 }
1428
1429 /* EU takes the value from the flag register and pushes it onto some
1430 * sort of a stack (presumably merging with any flag value already on
1431 * the stack). Within an if block, the flags at the top of the stack
1432 * control execution on each channel of the unit, e.g. on each of the
1433 * 16 pixel values in our wm programs.
1434 *
1435 * When the matching 'else' instruction is reached (presumably by
1436 * countdown of the instruction count patched in by our ELSE/ENDIF
1437 * functions), the relevant flags are inverted.
1438 *
1439 * When the matching 'endif' instruction is reached, the flags are
1440 * popped off. If the stack is now empty, normal execution resumes.
1441 */
1442 brw_inst *
1443 brw_IF(struct brw_codegen *p, unsigned execute_size)
1444 {
1445 const struct gen_device_info *devinfo = p->devinfo;
1446 brw_inst *insn;
1447
1448 insn = next_insn(p, BRW_OPCODE_IF);
1449
1450 /* Override the defaults for this instruction:
1451 */
1452 if (devinfo->gen < 6) {
1453 brw_set_dest(p, insn, brw_ip_reg());
1454 brw_set_src0(p, insn, brw_ip_reg());
1455 brw_set_src1(p, insn, brw_imm_d(0x0));
1456 } else if (devinfo->gen == 6) {
1457 brw_set_dest(p, insn, brw_imm_w(0));
1458 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1459 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1460 brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1461 } else if (devinfo->gen == 7) {
1462 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1463 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1464 brw_set_src1(p, insn, brw_imm_w(0));
1465 brw_inst_set_jip(devinfo, insn, 0);
1466 brw_inst_set_uip(devinfo, insn, 0);
1467 } else {
1468 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1469 brw_set_src0(p, insn, brw_imm_d(0));
1470 brw_inst_set_jip(devinfo, insn, 0);
1471 brw_inst_set_uip(devinfo, insn, 0);
1472 }
1473
1474 brw_inst_set_exec_size(devinfo, insn, execute_size);
1475 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1476 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
1477 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1478 if (!p->single_program_flow && devinfo->gen < 6)
1479 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1480
1481 push_if_stack(p, insn);
1482 p->if_depth_in_loop[p->loop_stack_depth]++;
1483 return insn;
1484 }
1485
1486 /* This function is only used for gen6-style IF instructions with an
1487 * embedded comparison (conditional modifier). It is not used on gen7.
1488 */
1489 brw_inst *
1490 gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
1491 struct brw_reg src0, struct brw_reg src1)
1492 {
1493 const struct gen_device_info *devinfo = p->devinfo;
1494 brw_inst *insn;
1495
1496 insn = next_insn(p, BRW_OPCODE_IF);
1497
1498 brw_set_dest(p, insn, brw_imm_w(0));
1499 brw_inst_set_exec_size(devinfo, insn,
1500 brw_inst_exec_size(devinfo, p->current));
1501 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1502 brw_set_src0(p, insn, src0);
1503 brw_set_src1(p, insn, src1);
1504
1505 assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
1506 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
1507 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1508
1509 push_if_stack(p, insn);
1510 return insn;
1511 }
1512
1513 /**
1514 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1515 */
1516 static void
1517 convert_IF_ELSE_to_ADD(struct brw_codegen *p,
1518 brw_inst *if_inst, brw_inst *else_inst)
1519 {
1520 const struct gen_device_info *devinfo = p->devinfo;
1521
1522 /* The next instruction (where the ENDIF would be, if it existed) */
1523 brw_inst *next_inst = &p->store[p->nr_insn];
1524
1525 assert(p->single_program_flow);
1526 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1527 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1528 assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);
1529
1530 /* Convert IF to an ADD instruction that moves the instruction pointer
1531 * to the first instruction of the ELSE block. If there is no ELSE
1532 * block, point to where ENDIF would be. Reverse the predicate.
1533 *
1534 * There's no need to execute an ENDIF since we don't need to do any
1535 * stack operations, and if we're currently executing, we just want to
1536 * continue normally.
1537 */
1538 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
1539 brw_inst_set_pred_inv(devinfo, if_inst, true);
1540
1541 if (else_inst != NULL) {
1542 /* Convert ELSE to an ADD instruction that points where the ENDIF
1543 * would be.
1544 */
1545 brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);
1546
1547 brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
1548 brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
1549 } else {
1550 brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
1551 }
1552 }
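/* Jump-distance sketch (hypothetical instruction indices): each native
 * instruction is 16 bytes (see brw_next_insn), so with IF at index 10,
 * ELSE at 14 and the would-be ENDIF at 20, the conversion yields:
 *
 *    IF   -> ADD ip, ip, (14 - 10 + 1) * 16    // just past the ELSE
 *    ELSE -> ADD ip, ip, (20 - 14) * 16        // at the next instruction
 */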
1553
1554 /**
1555 * Patch IF and ELSE instructions with appropriate jump targets.
1556 */
1557 static void
1558 patch_IF_ELSE(struct brw_codegen *p,
1559 brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
1560 {
1561 const struct gen_device_info *devinfo = p->devinfo;
1562
1563 /* We shouldn't be patching IF and ELSE instructions in single program flow
1564 * mode when gen < 6, because in single program flow mode on those
1565 * platforms, we convert flow control instructions to conditional ADDs that
1566 * operate on IP (see brw_ENDIF).
1567 *
1568 * However, on Gen6, writing to IP doesn't work in single program flow mode
1569 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1570 * not be updated by non-flow control instructions."). And on later
1571 * platforms, there is no significant benefit to converting control flow
1572 * instructions to conditional ADDs. So we do patch IF and ELSE
1573 * instructions in single program flow mode on those platforms.
1574 */
1575 if (devinfo->gen < 6)
1576 assert(!p->single_program_flow);
1577
1578 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1579 assert(endif_inst != NULL);
1580 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1581
1582 unsigned br = brw_jump_scale(devinfo);
1583
1584 assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
1585 brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
1586
1587 if (else_inst == NULL) {
1588 /* Patch IF -> ENDIF */
1589 if (devinfo->gen < 6) {
1590 /* Turn it into an IFF, which means no mask stack operations for
1591 * all-false and jumping past the ENDIF.
1592 */
1593 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
1594 brw_inst_set_gen4_jump_count(devinfo, if_inst,
1595 br * (endif_inst - if_inst + 1));
1596 brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1597 } else if (devinfo->gen == 6) {
1598 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1599 brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1600 } else {
1601 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1602 brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1603 }
1604 } else {
1605 brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
1606
1607 /* Patch IF -> ELSE */
1608 if (devinfo->gen < 6) {
1609 brw_inst_set_gen4_jump_count(devinfo, if_inst,
1610 br * (else_inst - if_inst));
1611 brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1612 } else if (devinfo->gen == 6) {
1613 brw_inst_set_gen6_jump_count(devinfo, if_inst,
1614 br * (else_inst - if_inst + 1));
1615 }
1616
1617 /* Patch ELSE -> ENDIF */
1618 if (devinfo->gen < 6) {
1619 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1620 * matching ENDIF.
1621 */
1622 brw_inst_set_gen4_jump_count(devinfo, else_inst,
1623 br * (endif_inst - else_inst + 1));
1624 brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
1625 } else if (devinfo->gen == 6) {
1626 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1627 brw_inst_set_gen6_jump_count(devinfo, else_inst,
1628 br * (endif_inst - else_inst));
1629 } else {
1630 /* The IF instruction's JIP should point just past the ELSE */
1631 brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1632 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1633 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1634 brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1635 if (devinfo->gen >= 8) {
1636 /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1637 * should point to ENDIF.
1638 */
1639 brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1640 }
1641 }
1642 }
1643 }
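
/* Worked example (illustrative only), gen7+ with an ELSE present and
 * br = brw_jump_scale(devinfo) = 2 (jump fields in units of 8 bytes):
 *
 *    i0: IF     JIP = 2 * (i3 - i0 + 1) = 8  -> i4 (just past the ELSE)
 *               UIP = 2 * (i5 - i0)     = 10 -> i5 (the ENDIF)
 *    i1: ... then block ...
 *    i2: ... then block ...
 *    i3: ELSE   JIP = 2 * (i5 - i3)     = 4  -> i5 (the ENDIF)
 *    i4: ... else block ...
 *    i5: ENDIF
 */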
1644
1645 void
1646 brw_ELSE(struct brw_codegen *p)
1647 {
1648 const struct gen_device_info *devinfo = p->devinfo;
1649 brw_inst *insn;
1650
1651 insn = next_insn(p, BRW_OPCODE_ELSE);
1652
1653 if (devinfo->gen < 6) {
1654 brw_set_dest(p, insn, brw_ip_reg());
1655 brw_set_src0(p, insn, brw_ip_reg());
1656 brw_set_src1(p, insn, brw_imm_d(0x0));
1657 } else if (devinfo->gen == 6) {
1658 brw_set_dest(p, insn, brw_imm_w(0));
1659 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1660 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1661 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1662 } else if (devinfo->gen == 7) {
1663 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1664 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1665 brw_set_src1(p, insn, brw_imm_w(0));
1666 brw_inst_set_jip(devinfo, insn, 0);
1667 brw_inst_set_uip(devinfo, insn, 0);
1668 } else {
1669 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1670 brw_set_src0(p, insn, brw_imm_d(0));
1671 brw_inst_set_jip(devinfo, insn, 0);
1672 brw_inst_set_uip(devinfo, insn, 0);
1673 }
1674
1675 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1676 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1677 if (!p->single_program_flow && devinfo->gen < 6)
1678 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1679
1680 push_if_stack(p, insn);
1681 }
1682
1683 void
1684 brw_ENDIF(struct brw_codegen *p)
1685 {
1686 const struct gen_device_info *devinfo = p->devinfo;
1687 brw_inst *insn = NULL;
1688 brw_inst *else_inst = NULL;
1689 brw_inst *if_inst = NULL;
1690 brw_inst *tmp;
1691 bool emit_endif = true;
1692
1693 /* In single program flow mode, we can express IF and ELSE instructions
1694 * equivalently as ADD instructions that operate on IP. On platforms prior
1695 * to Gen6, flow control instructions cause an implied thread switch, so
1696 * this is a significant savings.
1697 *
1698 * However, on Gen6, writing to IP doesn't work in single program flow mode
1699    * (see the Sandy Bridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1700 * not be updated by non-flow control instructions."). And on later
1701 * platforms, there is no significant benefit to converting control flow
1702 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1703 * Gen5.
1704 */
1705 if (devinfo->gen < 6 && p->single_program_flow)
1706 emit_endif = false;
1707
1708    /*
1709    * A single next_insn() may reallocate the instruction store and change
1710    * its base address (p->store), so call it first, before any pointers
1711    * into the store are derived from stored indices.
1712    */
1713 if (emit_endif)
1714 insn = next_insn(p, BRW_OPCODE_ENDIF);
1715
1716 /* Pop the IF and (optional) ELSE instructions from the stack */
1717 p->if_depth_in_loop[p->loop_stack_depth]--;
1718 tmp = pop_if_stack(p);
1719 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1720 else_inst = tmp;
1721 tmp = pop_if_stack(p);
1722 }
1723 if_inst = tmp;
1724
1725 if (!emit_endif) {
1726 /* ENDIF is useless; don't bother emitting it. */
1727 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1728 return;
1729 }
1730
1731 if (devinfo->gen < 6) {
1732 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1733 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1734 brw_set_src1(p, insn, brw_imm_d(0x0));
1735 } else if (devinfo->gen == 6) {
1736 brw_set_dest(p, insn, brw_imm_w(0));
1737 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1738 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1739 } else if (devinfo->gen == 7) {
1740 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1741 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1742 brw_set_src1(p, insn, brw_imm_w(0));
1743 } else {
1744 brw_set_src0(p, insn, brw_imm_d(0));
1745 }
1746
1747 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1748 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1749 if (devinfo->gen < 6)
1750 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1751
1752 /* Also pop item off the stack in the endif instruction: */
1753 if (devinfo->gen < 6) {
1754 brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1755 brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1756 } else if (devinfo->gen == 6) {
1757 brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1758 } else {
1759 brw_inst_set_jip(devinfo, insn, 2);
1760 }
1761 patch_IF_ELSE(p, if_inst, else_inst, insn);
1762 }
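
/* Illustrative sketch (not part of this file's API): how a caller drives
 * the if-stack machinery above.  brw_IF() is declared in brw_eu.h
 * alongside the helpers defined here.
 */
static inline void
example_emit_if_else(struct brw_codegen *p)
{
   brw_IF(p, BRW_EXECUTE_8);   /* pushes the IF on the if stack */
   /* ... emit then-block instructions ... */
   brw_ELSE(p);                /* pushes the ELSE on the if stack */
   /* ... emit else-block instructions ... */
   brw_ENDIF(p);               /* pops both and patches their jumps */
}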
1763
1764 brw_inst *
1765 brw_BREAK(struct brw_codegen *p)
1766 {
1767 const struct gen_device_info *devinfo = p->devinfo;
1768 brw_inst *insn;
1769
1770 insn = next_insn(p, BRW_OPCODE_BREAK);
1771 if (devinfo->gen >= 8) {
1772 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1773 brw_set_src0(p, insn, brw_imm_d(0x0));
1774 } else if (devinfo->gen >= 6) {
1775 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1776 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1777 brw_set_src1(p, insn, brw_imm_d(0x0));
1778 } else {
1779 brw_set_dest(p, insn, brw_ip_reg());
1780 brw_set_src0(p, insn, brw_ip_reg());
1781 brw_set_src1(p, insn, brw_imm_d(0x0));
1782 brw_inst_set_gen4_pop_count(devinfo, insn,
1783 p->if_depth_in_loop[p->loop_stack_depth]);
1784 }
1785 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1786 brw_inst_set_exec_size(devinfo, insn,
1787 brw_inst_exec_size(devinfo, p->current));
1788
1789 return insn;
1790 }
1791
1792 brw_inst *
1793 brw_CONT(struct brw_codegen *p)
1794 {
1795 const struct gen_device_info *devinfo = p->devinfo;
1796 brw_inst *insn;
1797
1798 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1799 brw_set_dest(p, insn, brw_ip_reg());
1800 if (devinfo->gen >= 8) {
1801 brw_set_src0(p, insn, brw_imm_d(0x0));
1802 } else {
1803 brw_set_src0(p, insn, brw_ip_reg());
1804 brw_set_src1(p, insn, brw_imm_d(0x0));
1805 }
1806
1807 if (devinfo->gen < 6) {
1808 brw_inst_set_gen4_pop_count(devinfo, insn,
1809 p->if_depth_in_loop[p->loop_stack_depth]);
1810 }
1811 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1812 brw_inst_set_exec_size(devinfo, insn,
1813 brw_inst_exec_size(devinfo, p->current));
1814 return insn;
1815 }
1816
1817 brw_inst *
1818 gen6_HALT(struct brw_codegen *p)
1819 {
1820 const struct gen_device_info *devinfo = p->devinfo;
1821 brw_inst *insn;
1822
1823 insn = next_insn(p, BRW_OPCODE_HALT);
1824 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1825 if (devinfo->gen >= 8) {
1826 brw_set_src0(p, insn, brw_imm_d(0x0));
1827 } else {
1828 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1829 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1830 }
1831
1832 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1833 brw_inst_set_exec_size(devinfo, insn,
1834 brw_inst_exec_size(devinfo, p->current));
1835 return insn;
1836 }
1837
1838 /* DO/WHILE loop:
1839 *
1840  * The DO/WHILE is just an unterminated loop -- break or continue are
1841  * used for control within the loop.  There are a few ways it can be
1842  * done.
1843 *
1844 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1845 * jip and no DO instruction.
1846 *
1847 * For non-uniform control flow pre-gen6, there's a DO instruction to
1848 * push the mask, and a WHILE to jump back, and BREAK to get out and
1849 * pop the mask.
1850 *
1851 * For gen6, there's no more mask stack, so no need for DO. WHILE
1852 * just points back to the first instruction of the loop.
1853 */
1854 brw_inst *
1855 brw_DO(struct brw_codegen *p, unsigned execute_size)
1856 {
1857 const struct gen_device_info *devinfo = p->devinfo;
1858
1859 if (devinfo->gen >= 6 || p->single_program_flow) {
1860 push_loop_stack(p, &p->store[p->nr_insn]);
1861 return &p->store[p->nr_insn];
1862 } else {
1863 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1864
1865 push_loop_stack(p, insn);
1866
1867 /* Override the defaults for this instruction:
1868 */
1869 brw_set_dest(p, insn, brw_null_reg());
1870 brw_set_src0(p, insn, brw_null_reg());
1871 brw_set_src1(p, insn, brw_null_reg());
1872
1873 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1874 brw_inst_set_exec_size(devinfo, insn, execute_size);
1875 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1876
1877 return insn;
1878 }
1879 }
1880
1881 /**
1882 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1883 * instruction here.
1884 *
1885 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1886 * nesting, since it can always just point to the end of the block/current loop.
1887 */
1888 static void
1889 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1890 {
1891 const struct gen_device_info *devinfo = p->devinfo;
1892 brw_inst *do_inst = get_inner_do_insn(p);
1893 brw_inst *inst;
1894 unsigned br = brw_jump_scale(devinfo);
1895
1896 assert(devinfo->gen < 6);
1897
1898 for (inst = while_inst - 1; inst != do_inst; inst--) {
1899       /* If the jump count is != 0, this instruction has already been
1900        * patched because it's part of a loop inside the one we're
1901        * patching.
1902        */
1903 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1904 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1905 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1906 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1907 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1908 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1909 }
1910 }
1911 }
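
/* E.g. with br = brw_jump_scale(devinfo): a BREAK at i2 under a WHILE at
 * i5 gets jump count br * (5 - 2 + 1), landing just past the WHILE, while
 * a CONTINUE at i2 gets br * (5 - 2), landing on the WHILE itself so the
 * loop condition is evaluated again.
 */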
1912
1913 brw_inst *
1914 brw_WHILE(struct brw_codegen *p)
1915 {
1916 const struct gen_device_info *devinfo = p->devinfo;
1917 brw_inst *insn, *do_insn;
1918 unsigned br = brw_jump_scale(devinfo);
1919
1920 if (devinfo->gen >= 6) {
1921 insn = next_insn(p, BRW_OPCODE_WHILE);
1922 do_insn = get_inner_do_insn(p);
1923
1924 if (devinfo->gen >= 8) {
1925 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1926 brw_set_src0(p, insn, brw_imm_d(0));
1927 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1928 } else if (devinfo->gen == 7) {
1929 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1930 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1931 brw_set_src1(p, insn, brw_imm_w(0));
1932 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1933 } else {
1934 brw_set_dest(p, insn, brw_imm_w(0));
1935 brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1936 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1937 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1938 }
1939
1940 brw_inst_set_exec_size(devinfo, insn,
1941 brw_inst_exec_size(devinfo, p->current));
1942
1943 } else {
1944 if (p->single_program_flow) {
1945 insn = next_insn(p, BRW_OPCODE_ADD);
1946 do_insn = get_inner_do_insn(p);
1947
1948 brw_set_dest(p, insn, brw_ip_reg());
1949 brw_set_src0(p, insn, brw_ip_reg());
1950 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1951 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1952 } else {
1953 insn = next_insn(p, BRW_OPCODE_WHILE);
1954 do_insn = get_inner_do_insn(p);
1955
1956 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1957
1958 brw_set_dest(p, insn, brw_ip_reg());
1959 brw_set_src0(p, insn, brw_ip_reg());
1960 brw_set_src1(p, insn, brw_imm_d(0));
1961
1962 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1963 brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1964 brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1965
1966 brw_patch_break_cont(p, insn);
1967 }
1968 }
1969 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1970
1971 p->loop_stack_depth--;
1972
1973 return insn;
1974 }
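
/* Illustrative sketch (not part of this file's API): a minimal loop using
 * the helpers above.  The flag setup is hypothetical -- a real caller
 * would emit a brw_CMP() or similar so f0.0 holds the exit condition
 * before the predicated BREAK.
 */
static inline void
example_emit_loop(struct brw_codegen *p)
{
   brw_DO(p, BRW_EXECUTE_8);    /* records the loop top (real DO pre-gen6) */

   /* ... emit the loop body; something writes f0.0 ... */

   brw_inst *brk = brw_BREAK(p);
   brw_inst_set_pred_control(p->devinfo, brk, BRW_PREDICATE_NORMAL);

   brw_WHILE(p);                /* jumps back; patches BREAKs on pre-gen6.
                                 * On gen6+ the BREAK's JIP/UIP are filled
                                 * in later by brw_set_uip_jip(). */
}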
1975
1976 /* FORWARD JUMPS:
1977 */
1978 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1979 {
1980 const struct gen_device_info *devinfo = p->devinfo;
1981 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1982 unsigned jmpi = 1;
1983
1984 if (devinfo->gen >= 5)
1985 jmpi = 2;
1986
1987 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1988 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1989
1990 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1991 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1992 }
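
/* Illustrative sketch (not part of this file's API) of the intended usage.
 * The JMPI is remembered by index rather than by pointer because emitting
 * the skipped instructions may realloc p->store.  brw_JMPI() is declared
 * in brw_eu.h.
 */
static inline void
example_forward_jump(struct brw_codegen *p)
{
   int jmp_idx = p->nr_insn;
   brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL); /* target patched below */
   /* ... emit the conditionally skipped instructions ... */
   brw_land_fwd_jump(p, jmp_idx); /* point the JMPI just past this point */
}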
1993
1994 /* To integrate with the above, it makes sense that the comparison
1995 * instruction should populate the flag register. It might be simpler
1996 * just to use the flag reg for most WM tasks?
1997 */
1998 void brw_CMP(struct brw_codegen *p,
1999 struct brw_reg dest,
2000 unsigned conditional,
2001 struct brw_reg src0,
2002 struct brw_reg src1)
2003 {
2004 const struct gen_device_info *devinfo = p->devinfo;
2005 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
2006
2007 brw_inst_set_cond_modifier(devinfo, insn, conditional);
2008 brw_set_dest(p, insn, dest);
2009 brw_set_src0(p, insn, src0);
2010 brw_set_src1(p, insn, src1);
2011
2012 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
2013 * page says:
2014 * "Any CMP instruction with a null destination must use a {switch}."
2015 *
2016 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
2017     * mentioned on their workarounds pages.
2018 */
2019 if (devinfo->gen == 7) {
2020 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2021 dest.nr == BRW_ARF_NULL) {
2022 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
2023 }
2024 }
2025 }
2026
2027 /***********************************************************************
2028 * Helpers for the various SEND message types:
2029 */
2030
2031 /** Extended math function, float[8].
2032 */
2033 void gen4_math(struct brw_codegen *p,
2034 struct brw_reg dest,
2035 unsigned function,
2036 unsigned msg_reg_nr,
2037 struct brw_reg src,
2038 unsigned precision )
2039 {
2040 const struct gen_device_info *devinfo = p->devinfo;
2041 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2042 unsigned data_type;
2043 if (has_scalar_region(src)) {
2044 data_type = BRW_MATH_DATA_SCALAR;
2045 } else {
2046 data_type = BRW_MATH_DATA_VECTOR;
2047 }
2048
2049 assert(devinfo->gen < 6);
2050
2051 /* Example code doesn't set predicate_control for send
2052 * instructions.
2053 */
2054 brw_inst_set_pred_control(devinfo, insn, 0);
2055 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2056
2057 brw_set_dest(p, insn, dest);
2058 brw_set_src0(p, insn, src);
2059 brw_set_math_message(p,
2060 insn,
2061 function,
2062 src.type == BRW_REGISTER_TYPE_D,
2063 precision,
2064 data_type);
2065 }
2066
2067 void gen6_math(struct brw_codegen *p,
2068 struct brw_reg dest,
2069 unsigned function,
2070 struct brw_reg src0,
2071 struct brw_reg src1)
2072 {
2073 const struct gen_device_info *devinfo = p->devinfo;
2074 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
2075
2076 assert(devinfo->gen >= 6);
2077
2078 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
2079 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
2080
2081 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
2082 if (devinfo->gen == 6) {
2083 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
2084 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
2085 }
2086
2087 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2088 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
2089 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2090 assert(src0.type != BRW_REGISTER_TYPE_F);
2091 assert(src1.type != BRW_REGISTER_TYPE_F);
2092 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2093 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2094 } else {
2095 assert(src0.type == BRW_REGISTER_TYPE_F);
2096 assert(src1.type == BRW_REGISTER_TYPE_F);
2097 }
2098
2099 /* Source modifiers are ignored for extended math instructions on Gen6. */
2100 if (devinfo->gen == 6) {
2101 assert(!src0.negate);
2102 assert(!src0.abs);
2103 assert(!src1.negate);
2104 assert(!src1.abs);
2105 }
2106
2107 brw_inst_set_math_function(devinfo, insn, function);
2108
2109 brw_set_dest(p, insn, dest);
2110 brw_set_src0(p, insn, src0);
2111 brw_set_src1(p, insn, src1);
2112 }
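
/* Illustrative sketch (not part of this file's API): a SIMD8 reciprocal
 * through the gen6+ math pipe.  Single-source functions take a null src1;
 * both sources must be float for the non-INT_DIV functions, which the
 * asserts above enforce.
 */
static inline void
example_math_inv(struct brw_codegen *p)
{
   gen6_math(p, brw_vec8_grf(124, 0), BRW_MATH_FUNCTION_INV,
             brw_vec8_grf(2, 0), brw_null_reg());
}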
2113
2114 /**
2115 * Return the right surface index to access the thread scratch space using
2116 * stateless dataport messages.
2117 */
2118 unsigned
2119 brw_scratch_surface_idx(const struct brw_codegen *p)
2120 {
2121 /* The scratch space is thread-local so IA coherency is unnecessary. */
2122 if (p->devinfo->gen >= 8)
2123 return GEN8_BTI_STATELESS_NON_COHERENT;
2124 else
2125 return BRW_BTI_STATELESS;
2126 }
2127
2128 /**
2129  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
2130 * using a constant offset per channel.
2131 *
2132 * The offset must be aligned to oword size (16 bytes). Used for
2133 * register spilling.
2134 */
2135 void brw_oword_block_write_scratch(struct brw_codegen *p,
2136 struct brw_reg mrf,
2137 int num_regs,
2138 unsigned offset)
2139 {
2140 const struct gen_device_info *devinfo = p->devinfo;
2141 const unsigned target_cache =
2142 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2143 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2144 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2145 uint32_t msg_type;
2146
2147 if (devinfo->gen >= 6)
2148 offset /= 16;
2149
2150 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2151
2152 const unsigned mlen = 1 + num_regs;
2153
2154 /* Set up the message header. This is g0, with g0.2 filled with
2155 * the offset. We don't want to leave our offset around in g0 or
2156 * it'll screw up texture samples, so set it up inside the message
2157 * reg.
2158 */
2159 {
2160 brw_push_insn_state(p);
2161 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2162 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2163 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2164
2165 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2166
2167 /* set message header global offset field (reg 0, element 2) */
2168 brw_MOV(p,
2169 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2170 mrf.nr,
2171 2), BRW_REGISTER_TYPE_UD),
2172 brw_imm_ud(offset));
2173
2174 brw_pop_insn_state(p);
2175 }
2176
2177 {
2178 struct brw_reg dest;
2179 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2180 int send_commit_msg;
2181 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2182 BRW_REGISTER_TYPE_UW);
2183
2184 brw_inst_set_compression(devinfo, insn, false);
2185
2186 if (brw_inst_exec_size(devinfo, insn) >= 16)
2187 src_header = vec16(src_header);
2188
2189 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2190 if (devinfo->gen < 6)
2191 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2192
2193 /* Until gen6, writes followed by reads from the same location
2194 * are not guaranteed to be ordered unless write_commit is set.
2195 * If set, then a no-op write is issued to the destination
2196 * register to set a dependency, and a read from the destination
2197 * can be used to ensure the ordering.
2198 *
2199 * For gen6, only writes between different threads need ordering
2200 * protection. Our use of DP writes is all about register
2201 * spilling within a thread.
2202 */
2203 if (devinfo->gen >= 6) {
2204 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2205 send_commit_msg = 0;
2206 } else {
2207 dest = src_header;
2208 send_commit_msg = 1;
2209 }
2210
2211 brw_set_dest(p, insn, dest);
2212 if (devinfo->gen >= 6) {
2213 brw_set_src0(p, insn, mrf);
2214 } else {
2215 brw_set_src0(p, insn, brw_null_reg());
2216 }
2217
2218 if (devinfo->gen >= 6)
2219 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2220 else
2221 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2222
2223 brw_set_dp_write_message(p,
2224 insn,
2225 brw_scratch_surface_idx(p),
2226 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2227 msg_type,
2228 target_cache,
2229 mlen,
2230 true, /* header_present */
2231 0, /* not a render target */
2232 send_commit_msg, /* response_length */
2233 0, /* eot */
2234 send_commit_msg);
2235 }
2236 }
2237
2238
2239 /**
2240 * Read a block of owords (half a GRF each) from the scratch buffer
2241 * using a constant index per channel.
2242 *
2243 * Offset must be aligned to oword size (16 bytes). Used for register
2244 * spilling.
2245 */
2246 void
2247 brw_oword_block_read_scratch(struct brw_codegen *p,
2248 struct brw_reg dest,
2249 struct brw_reg mrf,
2250 int num_regs,
2251 unsigned offset)
2252 {
2253 const struct gen_device_info *devinfo = p->devinfo;
2254
2255 if (devinfo->gen >= 6)
2256 offset /= 16;
2257
2258 if (p->devinfo->gen >= 7) {
2259 /* On gen 7 and above, we no longer have message registers and we can
2260 * send from any register we want. By using the destination register
2261 * for the message, we guarantee that the implied message write won't
2262 * accidentally overwrite anything. This has been a problem because
2263 * the MRF registers and source for the final FB write are both fixed
2264 * and may overlap.
2265 */
2266 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2267 } else {
2268 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2269 }
2270 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2271
2272 const unsigned rlen = num_regs;
2273 const unsigned target_cache =
2274 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2275 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2276 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2277
2278 {
2279 brw_push_insn_state(p);
2280 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2281 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2282 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2283
2284 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2285
2286 /* set message header global offset field (reg 0, element 2) */
2287 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2288
2289 brw_pop_insn_state(p);
2290 }
2291
2292 {
2293 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2294
2295 assert(brw_inst_pred_control(devinfo, insn) == 0);
2296 brw_inst_set_compression(devinfo, insn, false);
2297
2298 brw_set_dest(p, insn, dest); /* UW? */
2299 if (devinfo->gen >= 6) {
2300 brw_set_src0(p, insn, mrf);
2301 } else {
2302 brw_set_src0(p, insn, brw_null_reg());
2303 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2304 }
2305
2306 brw_set_dp_read_message(p,
2307 insn,
2308 brw_scratch_surface_idx(p),
2309 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2310 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2311 target_cache,
2312 1, /* msg_length */
2313 true, /* header_present */
2314 rlen);
2315 }
2316 }
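
/* Illustrative sketch (not part of this file's API): a matched spill/fill
 * pair.  Byte offsets must be OWORD (16 byte) aligned.  For the write, the
 * caller must already have staged the data in the registers following the
 * header (m2..m3 here; on gen7+ the MRFs are really GRFs, and the read
 * uses the destination register itself for its header, as noted above).
 */
static inline void
example_spill_fill(struct brw_codegen *p)
{
   /* Spill two GRFs of data (already staged in m2..m3) to scratch byte
    * offset 1024; m1 holds the message header.
    */
   brw_oword_block_write_scratch(p, brw_message_reg(1), 2, 1024);

   /* Fill the same two registers' worth back into g64..g65. */
   brw_oword_block_read_scratch(p, brw_vec8_grf(64, 0),
                                brw_message_reg(1), 2, 1024);
}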
2317
2318 void
2319 gen7_block_read_scratch(struct brw_codegen *p,
2320 struct brw_reg dest,
2321 int num_regs,
2322 unsigned offset)
2323 {
2324 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2325 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2326
2327 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2328
2329 /* The HW requires that the header is present; this is to get the g0.5
2330 * scratch offset.
2331 */
2332 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2333
2334 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2335 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2336 * is 32 bytes, which happens to be the size of a register.
2337 */
2338 offset /= REG_SIZE;
2339 assert(offset < (1 << 12));
2340
2341 gen7_set_dp_scratch_message(p, insn,
2342 false, /* scratch read */
2343 false, /* OWords */
2344 false, /* invalidate after read */
2345 num_regs,
2346 offset,
2347 1, /* mlen: just g0 */
2348 num_regs, /* rlen */
2349 true); /* header present */
2350 }
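
/* E.g. a fill from scratch byte offset 4096 passes offset / REG_SIZE = 128
 * HWords, comfortably inside the 12-bit field, which tops out at 4095
 * registers (just under 128KB of scratch).
 */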
2351
2352 /**
2353 * Read float[4] vectors from the data port constant cache.
2354 * Location (in buffer) should be a multiple of 16.
2355 * Used for fetching shader constants.
2356 */
2357 void brw_oword_block_read(struct brw_codegen *p,
2358 struct brw_reg dest,
2359 struct brw_reg mrf,
2360 uint32_t offset,
2361 uint32_t bind_table_index)
2362 {
2363 const struct gen_device_info *devinfo = p->devinfo;
2364 const unsigned target_cache =
2365 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2366 BRW_DATAPORT_READ_TARGET_DATA_CACHE);
2367 const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
2368
2369 /* On newer hardware, offset is in units of owords. */
2370 if (devinfo->gen >= 6)
2371 offset /= 16;
2372
2373 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2374
2375 brw_push_insn_state(p);
2376 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2377 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2378 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2379
2380 brw_push_insn_state(p);
2381 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2382 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2383
2384 /* set message header global offset field (reg 0, element 2) */
2385 brw_MOV(p,
2386 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2387 mrf.nr,
2388 2), BRW_REGISTER_TYPE_UD),
2389 brw_imm_ud(offset));
2390 brw_pop_insn_state(p);
2391
2392 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2393
2394 /* cast dest to a uword[8] vector */
2395 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2396
2397 brw_set_dest(p, insn, dest);
2398 if (devinfo->gen >= 6) {
2399 brw_set_src0(p, insn, mrf);
2400 } else {
2401 brw_set_src0(p, insn, brw_null_reg());
2402 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2403 }
2404
2405 brw_set_dp_read_message(p, insn, bind_table_index,
2406 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2407 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2408 target_cache,
2409 1, /* msg_length */
2410 true, /* header_present */
2411 DIV_ROUND_UP(exec_size, 8)); /* response_length */
2412
2413 brw_pop_insn_state(p);
2414 }
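
/* Illustrative sketch (not part of this file's API): fetching shader
 * constants from the buffer at a hypothetical binding-table index 12 into
 * g10; the block size tracks the default execution size (8 dwords for
 * SIMD8, 16 for SIMD16).
 */
static inline void
example_fetch_constants(struct brw_codegen *p)
{
   brw_oword_block_read(p, brw_vec8_grf(10, 0), brw_message_reg(1),
                        0 /* byte offset, multiple of 16 */, 12);
}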
2415
2416
2417 void brw_fb_WRITE(struct brw_codegen *p,
2418 struct brw_reg payload,
2419 struct brw_reg implied_header,
2420 unsigned msg_control,
2421 unsigned binding_table_index,
2422 unsigned msg_length,
2423 unsigned response_length,
2424 bool eot,
2425 bool last_render_target,
2426 bool header_present)
2427 {
2428 const struct gen_device_info *devinfo = p->devinfo;
2429 const unsigned target_cache =
2430 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2431 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2432 brw_inst *insn;
2433 unsigned msg_type;
2434 struct brw_reg dest, src0;
2435
2436 if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
2437 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2438 else
2439 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2440
2441 if (devinfo->gen >= 6) {
2442 insn = next_insn(p, BRW_OPCODE_SENDC);
2443 } else {
2444 insn = next_insn(p, BRW_OPCODE_SEND);
2445 }
2446 brw_inst_set_compression(devinfo, insn, false);
2447
2448 if (devinfo->gen >= 6) {
2449 /* headerless version, just submit color payload */
2450 src0 = payload;
2451
2452 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2453 } else {
2454 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2455 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2456 src0 = implied_header;
2457
2458 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2459 }
2460
2461 brw_set_dest(p, insn, dest);
2462 brw_set_src0(p, insn, src0);
2463 brw_set_dp_write_message(p,
2464 insn,
2465 binding_table_index,
2466 msg_control,
2467 msg_type,
2468 target_cache,
2469 msg_length,
2470 header_present,
2471 last_render_target,
2472 response_length,
2473 eot,
2474 0 /* send_commit_msg */);
2475 }
2476
2477 brw_inst *
2478 gen9_fb_READ(struct brw_codegen *p,
2479 struct brw_reg dst,
2480 struct brw_reg payload,
2481 unsigned binding_table_index,
2482 unsigned msg_length,
2483 unsigned response_length,
2484 bool per_sample)
2485 {
2486 const struct gen_device_info *devinfo = p->devinfo;
2487 assert(devinfo->gen >= 9);
2488 const unsigned msg_subtype =
2489 brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16 ? 0 : 1;
2490 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2491
2492 brw_set_dest(p, insn, dst);
2493 brw_set_src0(p, insn, payload);
2494 brw_set_dp_read_message(p, insn, binding_table_index,
2495 per_sample << 5 | msg_subtype,
2496 GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2497 GEN6_SFID_DATAPORT_RENDER_CACHE,
2498 msg_length, true /* header_present */,
2499 response_length);
2500 brw_inst_set_rt_slot_group(devinfo, insn,
2501 brw_inst_qtr_control(devinfo, p->current) / 2);
2502
2503 return insn;
2504 }
2505
2506 /**
2507 * Texture sample instruction.
2508 * Note: the msg_type plus msg_length values determine exactly what kind
2509 * of sampling operation is performed. See volume 4, page 161 of docs.
2510 */
2511 void brw_SAMPLE(struct brw_codegen *p,
2512 struct brw_reg dest,
2513 unsigned msg_reg_nr,
2514 struct brw_reg src0,
2515 unsigned binding_table_index,
2516 unsigned sampler,
2517 unsigned msg_type,
2518 unsigned response_length,
2519 unsigned msg_length,
2520 unsigned header_present,
2521 unsigned simd_mode,
2522 unsigned return_format)
2523 {
2524 const struct gen_device_info *devinfo = p->devinfo;
2525 brw_inst *insn;
2526
2527 if (msg_reg_nr != -1)
2528 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2529
2530 insn = next_insn(p, BRW_OPCODE_SEND);
2531 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2532
2533 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2534 *
2535 * "Instruction compression is not allowed for this instruction (that
2536 * is, send). The hardware behavior is undefined if this instruction is
2537 * set as compressed. However, compress control can be set to "SecHalf"
2538 * to affect the EMask generation."
2539 *
2540 * No similar wording is found in later PRMs, but there are examples
2541 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2542 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2543 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2544 */
2545 brw_inst_set_compression(devinfo, insn, false);
2546
2547 if (devinfo->gen < 6)
2548 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2549
2550 brw_set_dest(p, insn, dest);
2551 brw_set_src0(p, insn, src0);
2552 brw_set_sampler_message(p, insn,
2553 binding_table_index,
2554 sampler,
2555 msg_type,
2556 response_length,
2557 msg_length,
2558 header_present,
2559 simd_mode,
2560 return_format);
2561 }
2562
2563 /* Adjust the message header's sampler state pointer to
2564 * select the correct group of 16 samplers.
2565 */
2566 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2567 struct brw_reg header,
2568 struct brw_reg sampler_index)
2569 {
2570 /* The "Sampler Index" field can only store values between 0 and 15.
2571 * However, we can add an offset to the "Sampler State Pointer"
2572 * field, effectively selecting a different set of 16 samplers.
2573 *
2574 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2575    * offset, and each sampler state is only 16 bytes, so we can't
2576 * exclusively use the offset - we have to use both.
2577 */
2578
2579 const struct gen_device_info *devinfo = p->devinfo;
2580
2581 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2582 const int sampler_state_size = 16; /* 16 bytes */
2583 uint32_t sampler = sampler_index.ud;
2584
2585 if (sampler >= 16) {
2586 assert(devinfo->is_haswell || devinfo->gen >= 8);
2587 brw_ADD(p,
2588 get_element_ud(header, 3),
2589 get_element_ud(brw_vec8_grf(0, 0), 3),
2590 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2591 }
2592 } else {
2593 /* Non-const sampler array indexing case */
2594 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2595 return;
2596 }
2597
2598 struct brw_reg temp = get_element_ud(header, 3);
2599
2600 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2601 brw_SHL(p, temp, temp, brw_imm_ud(4));
2602 brw_ADD(p,
2603 get_element_ud(header, 3),
2604 get_element_ud(brw_vec8_grf(0, 0), 3),
2605 temp);
2606 }
2607 }
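
/* Worked example: an immediate sampler index of 20 selects group
 * 20 / 16 = 1, so 16 * 1 * 16 = 256 bytes are added to the Sampler State
 * Pointer in g0.3, and the remaining 20 % 16 = 4 goes in the descriptor's
 * 4-bit "Sampler Index" field.  The indirect path computes the same
 * offset per channel: (20 & 0xf0) << 4 = 256.
 */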
2608
2609 /* All these variables are pretty confusing - we might be better off
2610 * using bitmasks and macros for this, in the old style. Or perhaps
2611 * just having the caller instantiate the fields in dword3 itself.
2612 */
2613 void brw_urb_WRITE(struct brw_codegen *p,
2614 struct brw_reg dest,
2615 unsigned msg_reg_nr,
2616 struct brw_reg src0,
2617 enum brw_urb_write_flags flags,
2618 unsigned msg_length,
2619 unsigned response_length,
2620 unsigned offset,
2621 unsigned swizzle)
2622 {
2623 const struct gen_device_info *devinfo = p->devinfo;
2624 brw_inst *insn;
2625
2626 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2627
2628 if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2629 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2630 brw_push_insn_state(p);
2631 brw_set_default_access_mode(p, BRW_ALIGN_1);
2632 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2633 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2634 BRW_REGISTER_TYPE_UD),
2635 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2636 brw_imm_ud(0xff00));
2637 brw_pop_insn_state(p);
2638 }
2639
2640 insn = next_insn(p, BRW_OPCODE_SEND);
2641
2642 assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2643
2644 brw_set_dest(p, insn, dest);
2645 brw_set_src0(p, insn, src0);
2646 brw_set_src1(p, insn, brw_imm_d(0));
2647
2648 if (devinfo->gen < 6)
2649 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2650
2651 brw_set_urb_message(p,
2652 insn,
2653 flags,
2654 msg_length,
2655 response_length,
2656 offset,
2657 swizzle);
2658 }
2659
2660 struct brw_inst *
2661 brw_send_indirect_message(struct brw_codegen *p,
2662 unsigned sfid,
2663 struct brw_reg dst,
2664 struct brw_reg payload,
2665 struct brw_reg desc)
2666 {
2667 const struct gen_device_info *devinfo = p->devinfo;
2668 struct brw_inst *send;
2669 int setup;
2670
2671 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2672
2673 assert(desc.type == BRW_REGISTER_TYPE_UD);
2674
2675 /* We hold on to the setup instruction (the SEND in the direct case, the OR
2676 * in the indirect case) by its index in the instruction store. The
2677 * pointer returned by next_insn() may become invalid if emitting the SEND
2678 * in the indirect case reallocs the store.
2679 */
2680
2681 if (desc.file == BRW_IMMEDIATE_VALUE) {
2682 setup = p->nr_insn;
2683 send = next_insn(p, BRW_OPCODE_SEND);
2684 brw_set_src1(p, send, desc);
2685
2686 } else {
2687 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2688
2689 brw_push_insn_state(p);
2690 brw_set_default_access_mode(p, BRW_ALIGN_1);
2691 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2692 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2693
2694 /* Load the indirect descriptor to an address register using OR so the
2695 * caller can specify additional descriptor bits with the usual
2696 * brw_set_*_message() helper functions.
2697 */
2698 setup = p->nr_insn;
2699 brw_OR(p, addr, desc, brw_imm_ud(0));
2700
2701 brw_pop_insn_state(p);
2702
2703 send = next_insn(p, BRW_OPCODE_SEND);
2704 brw_set_src1(p, send, addr);
2705 }
2706
2707 if (dst.width < BRW_EXECUTE_8)
2708 brw_inst_set_exec_size(devinfo, send, dst.width);
2709
2710 brw_set_dest(p, send, dst);
2711 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2712 brw_inst_set_sfid(devinfo, send, sfid);
2713
2714 return &p->store[setup];
2715 }
2716
2717 static struct brw_inst *
2718 brw_send_indirect_surface_message(struct brw_codegen *p,
2719 unsigned sfid,
2720 struct brw_reg dst,
2721 struct brw_reg payload,
2722 struct brw_reg surface,
2723 unsigned message_len,
2724 unsigned response_len,
2725 bool header_present)
2726 {
2727 const struct gen_device_info *devinfo = p->devinfo;
2728 struct brw_inst *insn;
2729
2730 if (surface.file != BRW_IMMEDIATE_VALUE) {
2731 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2732
2733 brw_push_insn_state(p);
2734 brw_set_default_access_mode(p, BRW_ALIGN_1);
2735 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2736 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2737
2738       /* Mask out invalid bits from the surface index to avoid hangs, e.g. when
2739 * some surface array is accessed out of bounds.
2740 */
2741 insn = brw_AND(p, addr,
2742 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2743 BRW_GET_SWZ(surface.swizzle, 0)),
2744 brw_imm_ud(0xff));
2745
2746 brw_pop_insn_state(p);
2747
2748 surface = addr;
2749 }
2750
2751 insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
2752 brw_inst_set_mlen(devinfo, insn, message_len);
2753 brw_inst_set_rlen(devinfo, insn, response_len);
2754 brw_inst_set_header_present(devinfo, insn, header_present);
2755
2756 return insn;
2757 }
2758
2759 static bool
2760 while_jumps_before_offset(const struct gen_device_info *devinfo,
2761 brw_inst *insn, int while_offset, int start_offset)
2762 {
2763 int scale = 16 / brw_jump_scale(devinfo);
2764 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2765 : brw_inst_jip(devinfo, insn);
2766 assert(jip < 0);
2767 return while_offset + jip * scale <= start_offset;
2768 }
2769
2770
2771 static int
2772 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2773 {
2774 int offset;
2775 void *store = p->store;
2776 const struct gen_device_info *devinfo = p->devinfo;
2777
2778 int depth = 0;
2779
2780 for (offset = next_offset(devinfo, store, start_offset);
2781 offset < p->next_insn_offset;
2782 offset = next_offset(devinfo, store, offset)) {
2783 brw_inst *insn = store + offset;
2784
2785 switch (brw_inst_opcode(devinfo, insn)) {
2786 case BRW_OPCODE_IF:
2787 depth++;
2788 break;
2789 case BRW_OPCODE_ENDIF:
2790 if (depth == 0)
2791 return offset;
2792 depth--;
2793 break;
2794 case BRW_OPCODE_WHILE:
2795 /* If the while doesn't jump before our instruction, it's the end
2796 * of a sibling do...while loop. Ignore it.
2797 */
2798 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2799 continue;
2800 /* fallthrough */
2801 case BRW_OPCODE_ELSE:
2802 case BRW_OPCODE_HALT:
2803 if (depth == 0)
2804 return offset;
2805 }
2806 }
2807
2808 return 0;
2809 }
2810
2811 /* There is no DO instruction on gen6, so to find the end of the loop
2812 * we have to see if the loop is jumping back before our start
2813 * instruction.
2814 */
2815 static int
2816 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2817 {
2818 const struct gen_device_info *devinfo = p->devinfo;
2819 int offset;
2820 void *store = p->store;
2821
2822 assert(devinfo->gen >= 6);
2823
2824 /* Always start after the instruction (such as a WHILE) we're trying to fix
2825 * up.
2826 */
2827 for (offset = next_offset(devinfo, store, start_offset);
2828 offset < p->next_insn_offset;
2829 offset = next_offset(devinfo, store, offset)) {
2830 brw_inst *insn = store + offset;
2831
2832 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2833 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2834 return offset;
2835 }
2836 }
2837 assert(!"not reached");
2838 return start_offset;
2839 }
2840
2841 /* After program generation, go back and update the UIP and JIP of
2842 * BREAK, CONT, and HALT instructions to their correct locations.
2843 */
2844 void
2845 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2846 {
2847 const struct gen_device_info *devinfo = p->devinfo;
2848 int offset;
2849 int br = brw_jump_scale(devinfo);
2850 int scale = 16 / br;
2851 void *store = p->store;
2852
2853 if (devinfo->gen < 6)
2854 return;
2855
2856 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2857 brw_inst *insn = store + offset;
2858 assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2859
2860 int block_end_offset = brw_find_next_block_end(p, offset);
2861 switch (brw_inst_opcode(devinfo, insn)) {
2862 case BRW_OPCODE_BREAK:
2863 assert(block_end_offset != 0);
2864 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2865 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2866 brw_inst_set_uip(devinfo, insn,
2867 (brw_find_loop_end(p, offset) - offset +
2868 (devinfo->gen == 6 ? 16 : 0)) / scale);
2869 break;
2870 case BRW_OPCODE_CONTINUE:
2871 assert(block_end_offset != 0);
2872 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2873 brw_inst_set_uip(devinfo, insn,
2874 (brw_find_loop_end(p, offset) - offset) / scale);
2875
2876 assert(brw_inst_uip(devinfo, insn) != 0);
2877 assert(brw_inst_jip(devinfo, insn) != 0);
2878 break;
2879
2880 case BRW_OPCODE_ENDIF: {
2881 int32_t jump = (block_end_offset == 0) ?
2882 1 * br : (block_end_offset - offset) / scale;
2883 if (devinfo->gen >= 7)
2884 brw_inst_set_jip(devinfo, insn, jump);
2885 else
2886 brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2887 break;
2888 }
2889
2890 case BRW_OPCODE_HALT:
2891 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2892 *
2893 * "In case of the halt instruction not inside any conditional
2894 * code block, the value of <JIP> and <UIP> should be the
2895 * same. In case of the halt instruction inside conditional code
2896 * block, the <UIP> should be the end of the program, and the
2897 * <JIP> should be end of the most inner conditional code block."
2898 *
2899 * The uip will have already been set by whoever set up the
2900 * instruction.
2901 */
2902 if (block_end_offset == 0) {
2903 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2904 } else {
2905 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2906 }
2907 assert(brw_inst_uip(devinfo, insn) != 0);
2908 assert(brw_inst_jip(devinfo, insn) != 0);
2909 break;
2910 }
2911 }
2912 }
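
/* E.g. on gen7+, br = 2 and scale = 8: jump fields count 8-byte units, so
 * a BREAK whose block end is 3 instructions (48 bytes) ahead stores
 * JIP = 48 / 8 = 6.  On gen6, br = 1 and scale = 16, and the same distance
 * is stored as 3.
 */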
2913
2914 void brw_ff_sync(struct brw_codegen *p,
2915 struct brw_reg dest,
2916 unsigned msg_reg_nr,
2917 struct brw_reg src0,
2918 bool allocate,
2919 unsigned response_length,
2920 bool eot)
2921 {
2922 const struct gen_device_info *devinfo = p->devinfo;
2923 brw_inst *insn;
2924
2925 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2926
2927 insn = next_insn(p, BRW_OPCODE_SEND);
2928 brw_set_dest(p, insn, dest);
2929 brw_set_src0(p, insn, src0);
2930 brw_set_src1(p, insn, brw_imm_d(0));
2931
2932 if (devinfo->gen < 6)
2933 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2934
2935 brw_set_ff_sync_message(p,
2936 insn,
2937 allocate,
2938 response_length,
2939 eot);
2940 }
2941
2942 /**
2943 * Emit the SEND instruction necessary to generate stream output data on Gen6
2944 * (for transform feedback).
2945 *
2946 * If send_commit_msg is true, this is the last piece of stream output data
2947 * from this thread, so send the data as a committed write. According to the
2948 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2949 *
2950 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2951 * writes are complete by sending the final write as a committed write."
2952 */
2953 void
2954 brw_svb_write(struct brw_codegen *p,
2955 struct brw_reg dest,
2956 unsigned msg_reg_nr,
2957 struct brw_reg src0,
2958 unsigned binding_table_index,
2959 bool send_commit_msg)
2960 {
2961 const struct gen_device_info *devinfo = p->devinfo;
2962 const unsigned target_cache =
2963 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2964 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2965 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2966 brw_inst *insn;
2967
2968 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2969
2970 insn = next_insn(p, BRW_OPCODE_SEND);
2971 brw_set_dest(p, insn, dest);
2972 brw_set_src0(p, insn, src0);
2973 brw_set_src1(p, insn, brw_imm_d(0));
2974 brw_set_dp_write_message(p, insn,
2975 binding_table_index,
2976 0, /* msg_control: ignored */
2977 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2978 target_cache,
2979 1, /* msg_length */
2980 true, /* header_present */
2981 0, /* last_render_target: ignored */
2982 send_commit_msg, /* response_length */
2983 0, /* end_of_thread */
2984 send_commit_msg); /* send_commit_msg */
2985 }
2986
2987 static unsigned
2988 brw_surface_payload_size(struct brw_codegen *p,
2989 unsigned num_channels,
2990 bool has_simd4x2,
2991 bool has_simd16)
2992 {
2993 if (has_simd4x2 &&
2994 brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2995 return 1;
2996 else if (has_simd16 &&
2997 brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16)
2998 return 2 * num_channels;
2999 else
3000 return num_channels;
3001 }
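
/* E.g. a 3-channel untyped read returns 3 GRFs in SIMD8 and 2 * 3 = 6
 * GRFs in SIMD16, but only a single GRF in Align16 SIMD4x2 mode where the
 * message supports it.
 */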
3002
3003 static void
3004 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
3005 brw_inst *insn,
3006 unsigned atomic_op,
3007 bool response_expected)
3008 {
3009 const struct gen_device_info *devinfo = p->devinfo;
3010 unsigned msg_control =
3011 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3012 (response_expected ? 1 << 5 : 0); /* Return data expected */
3013
3014 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3015 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3016 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
3017 msg_control |= 1 << 4; /* SIMD8 mode */
3018
3019 brw_inst_set_dp_msg_type(devinfo, insn,
3020 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
3021 } else {
3022 brw_inst_set_dp_msg_type(devinfo, insn,
3023 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
3024 }
3025 } else {
3026 brw_inst_set_dp_msg_type(devinfo, insn,
3027 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
3028
3029 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
3030 msg_control |= 1 << 4; /* SIMD8 mode */
3031 }
3032
3033 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3034 }
3035
3036 void
3037 brw_untyped_atomic(struct brw_codegen *p,
3038 struct brw_reg dst,
3039 struct brw_reg payload,
3040 struct brw_reg surface,
3041 unsigned atomic_op,
3042 unsigned msg_length,
3043 bool response_expected)
3044 {
3045 const struct gen_device_info *devinfo = p->devinfo;
3046 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3047 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3048 GEN7_SFID_DATAPORT_DATA_CACHE);
3049 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3050 /* Mask out unused components -- This is especially important in Align16
3051 * mode on generations that don't have native support for SIMD4x2 atomics,
3052 * because unused but enabled components will cause the dataport to perform
3053 * additional atomic operations on the addresses that happen to be in the
3054 * uninitialized Y, Z and W coordinates of the payload.
3055 */
3056 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3057 struct brw_inst *insn = brw_send_indirect_surface_message(
3058 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3059 brw_surface_payload_size(p, response_expected,
3060 devinfo->gen >= 8 || devinfo->is_haswell, true),
3061 align1);
3062
3063 brw_set_dp_untyped_atomic_message(
3064 p, insn, atomic_op, response_expected);
3065 }
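
/* Illustrative sketch (not part of this file's API): a SIMD8 atomic
 * increment on a hypothetical surface at binding-table index 0, with the
 * per-channel offsets staged in g1 and the old values returned in g120.
 */
static inline void
example_atomic_inc(struct brw_codegen *p)
{
   brw_untyped_atomic(p, brw_vec8_grf(120, 0), brw_vec8_grf(1, 0),
                      brw_imm_ud(0), BRW_AOP_INC, 1 /* msg_length */,
                      true /* response_expected */);
}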
3066
3067 static void
3068 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
3069 struct brw_inst *insn,
3070 unsigned num_channels)
3071 {
3072 const struct gen_device_info *devinfo = p->devinfo;
3073 /* Set mask of 32-bit channels to drop. */
3074 unsigned msg_control = 0xf & (0xf << num_channels);
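   /* E.g. num_channels == 2 keeps X and Y: 0xf & (0xf << 2) = 0xc flags
    * the Z and W channels to be dropped.
    */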
3075
3076 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3077 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3078 msg_control |= 1 << 4; /* SIMD16 mode */
3079 else
3080 msg_control |= 2 << 4; /* SIMD8 mode */
3081 }
3082
3083 brw_inst_set_dp_msg_type(devinfo, insn,
3084 (devinfo->gen >= 8 || devinfo->is_haswell ?
3085 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
3086 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
3087 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3088 }
3089
3090 void
3091 brw_untyped_surface_read(struct brw_codegen *p,
3092 struct brw_reg dst,
3093 struct brw_reg payload,
3094 struct brw_reg surface,
3095 unsigned msg_length,
3096 unsigned num_channels)
3097 {
3098 const struct gen_device_info *devinfo = p->devinfo;
3099 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3100 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3101 GEN7_SFID_DATAPORT_DATA_CACHE);
3102 struct brw_inst *insn = brw_send_indirect_surface_message(
3103 p, sfid, dst, payload, surface, msg_length,
3104 brw_surface_payload_size(p, num_channels, true, true),
3105 false);
3106
3107 brw_set_dp_untyped_surface_read_message(
3108 p, insn, num_channels);
3109 }
3110
3111 static void
3112 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
3113 struct brw_inst *insn,
3114 unsigned num_channels)
3115 {
3116 const struct gen_device_info *devinfo = p->devinfo;
3117 /* Set mask of 32-bit channels to drop. */
3118 unsigned msg_control = 0xf & (0xf << num_channels);
3119
3120 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3121 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3122 msg_control |= 1 << 4; /* SIMD16 mode */
3123 else
3124 msg_control |= 2 << 4; /* SIMD8 mode */
3125 } else {
3126 if (devinfo->gen >= 8 || devinfo->is_haswell)
3127 msg_control |= 0 << 4; /* SIMD4x2 mode */
3128 else
3129 msg_control |= 2 << 4; /* SIMD8 mode */
3130 }
3131
3132 brw_inst_set_dp_msg_type(devinfo, insn,
3133 devinfo->gen >= 8 || devinfo->is_haswell ?
3134 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
3135 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
3136 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3137 }
3138
3139 void
3140 brw_untyped_surface_write(struct brw_codegen *p,
3141 struct brw_reg payload,
3142 struct brw_reg surface,
3143 unsigned msg_length,
3144 unsigned num_channels)
3145 {
3146 const struct gen_device_info *devinfo = p->devinfo;
3147 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3148 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3149 GEN7_SFID_DATAPORT_DATA_CACHE);
3150 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3151 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3152 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3153 WRITEMASK_X : WRITEMASK_XYZW;
3154 struct brw_inst *insn = brw_send_indirect_surface_message(
3155 p, sfid, brw_writemask(brw_null_reg(), mask),
3156 payload, surface, msg_length, 0, align1);
3157
3158 brw_set_dp_untyped_surface_write_message(
3159 p, insn, num_channels);
3160 }
3161
3162 static void
3163 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3164 struct brw_inst *insn,
3165 unsigned atomic_op,
3166 bool response_expected)
3167 {
3168 const struct gen_device_info *devinfo = p->devinfo;
3169 unsigned msg_control =
3170 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3171 (response_expected ? 1 << 5 : 0); /* Return data expected */
3172
3173 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3174 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3175 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3176 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3177
3178 brw_inst_set_dp_msg_type(devinfo, insn,
3179 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3180 } else {
3181 brw_inst_set_dp_msg_type(devinfo, insn,
3182 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3183 }
3184
3185 } else {
3186 brw_inst_set_dp_msg_type(devinfo, insn,
3187 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3188
3189 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3190 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3191 }
3192
3193 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3194 }
3195
3196 void
3197 brw_typed_atomic(struct brw_codegen *p,
3198 struct brw_reg dst,
3199 struct brw_reg payload,
3200 struct brw_reg surface,
3201 unsigned atomic_op,
3202 unsigned msg_length,
3203 bool response_expected) {
3204 const struct gen_device_info *devinfo = p->devinfo;
3205 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3206 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3207 GEN6_SFID_DATAPORT_RENDER_CACHE);
3208 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3209 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3210 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3211 struct brw_inst *insn = brw_send_indirect_surface_message(
3212 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3213 brw_surface_payload_size(p, response_expected,
3214 devinfo->gen >= 8 || devinfo->is_haswell, false),
3215 true);
3216
3217 brw_set_dp_typed_atomic_message(
3218 p, insn, atomic_op, response_expected);
3219 }
3220
3221 static void
3222 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3223 struct brw_inst *insn,
3224 unsigned num_channels)
3225 {
3226 const struct gen_device_info *devinfo = p->devinfo;
3227 /* Set mask of unused channels. */
3228 unsigned msg_control = 0xf & (0xf << num_channels);
3229
3230 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3231 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3232 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3233 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3234 else
3235 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3236 }
3237
3238 brw_inst_set_dp_msg_type(devinfo, insn,
3239 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3240 } else {
3241 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3242 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3243 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3244 }
3245
3246 brw_inst_set_dp_msg_type(devinfo, insn,
3247 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3248 }
3249
3250 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3251 }
3252
3253 void
3254 brw_typed_surface_read(struct brw_codegen *p,
3255 struct brw_reg dst,
3256 struct brw_reg payload,
3257 struct brw_reg surface,
3258 unsigned msg_length,
3259 unsigned num_channels)
3260 {
3261 const struct gen_device_info *devinfo = p->devinfo;
3262 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3263 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3264 GEN6_SFID_DATAPORT_RENDER_CACHE);
3265 struct brw_inst *insn = brw_send_indirect_surface_message(
3266 p, sfid, dst, payload, surface, msg_length,
3267 brw_surface_payload_size(p, num_channels,
3268 devinfo->gen >= 8 || devinfo->is_haswell, false),
3269 true);
3270
3271 brw_set_dp_typed_surface_read_message(
3272 p, insn, num_channels);
3273 }
3274
3275 static void
3276 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3277 struct brw_inst *insn,
3278 unsigned num_channels)
3279 {
3280 const struct gen_device_info *devinfo = p->devinfo;
3281 /* Set mask of unused channels. */
3282 unsigned msg_control = 0xf & (0xf << num_channels);

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
         else
            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);

   } else {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}

void
brw_typed_surface_write(struct brw_codegen *p,
                        struct brw_reg payload,
                        struct brw_reg surface,
                        unsigned msg_length,
                        unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN6_SFID_DATAPORT_RENDER_CACHE);
   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
                          WRITEMASK_X : WRITEMASK_XYZW);
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(brw_null_reg(), mask),
      payload, surface, msg_length, 0, true);

   brw_set_dp_typed_surface_write_message(
      p, insn, num_channels);
}

static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, insn, sfid,
                              1 /* message length */,
                              (commit_enable ? 1 : 0) /* response length */,
                              true /* header present */,
                              false);

   switch (sfid) {
   case GEN6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GEN7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

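   /* Bit 5 of the message control field is the commit-enable flag, which
    * requests a write-back once the fence has completed.
    */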
   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
}

void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
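   /* Only IVB requests a commit write-back: the second flush and the
    * stalling MOV at the end of this function depend on the fence
    * responses being written.
    */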
   struct brw_inst *insn;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = vec1(dst);

   /* Set dst as destination for dependency tracking; the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   dst = retype(dst, BRW_REGISTER_TYPE_UW);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need
       * to flush it too.  Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of
       * the first to trigger a pipeline stall -- this way future render and
       * data cache messages will be properly ordered with respect to past
       * data and render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   brw_pop_insn_state(p);
}

void
brw_pixel_interpolator_query(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             bool noperspective,
                             unsigned mode,
                             struct brw_reg data,
                             unsigned msg_length,
                             unsigned response_length)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;
   const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);

   /* brw_send_indirect_message will automatically use a direct send message
    * if data is actually immediate.
    */
   insn = brw_send_indirect_message(p,
                                    GEN7_SFID_PIXEL_INTERPOLATOR,
                                    dest,
                                    mrf,
                                    vec1(data));
   brw_inst_set_mlen(devinfo, insn, msg_length);
   brw_inst_set_rlen(devinfo, insn, response_length);

   brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
   brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
   brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
   brw_inst_set_pi_message_type(devinfo, insn, mode);
}

void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
   const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask.  The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless there.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n).  Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }
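         /* E.g. with a dispatch mask of 0xff and qtr_control == 0 the SHR
          * is a no-op and the AND simply clears any ce0 bits above
          * channel 7.
          */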

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_reg(1, 0);

         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0.  We could use a single 32-wide move here if it
          * weren't for the hardware bug that causes channel enables to be
          * applied incorrectly to the second half of 32-wide instructions on
          * Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_flag_reg_nr(devinfo, inst, 1);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         }
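         /* E.g. a 32-wide invocation emits two 16-wide MOVs covering
          * channels 0-15 and 16-31 respectively.
          */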

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
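         /* brw_int_type(exec_size / 8, false) gives a UB view for an 8-wide
          * mask and a UW view for a 16-wide one, so the byte offset by
          * qtr_control below indexes the matching chunk of f1.0.
          */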
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.  Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
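         /* The AND below computes dst.x = ~ce0 & 1, i.e. zero if the first
          * channel is live and one otherwise.
          */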
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination first without and then with execution
          * masking in order to find out which of the channels is active.
          */
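         /* The unmasked MOV seeds dst.x with 1; the masked MOV then
          * overwrites it with 0 only if the first slot is actually enabled,
          * leaving the index of the first live channel behind.
          */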
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}

void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job,
       * but asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         const unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));
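         /* E.g. for a tightly packed float source (type_sz == 4, encoded
          * hstride == 1) the shift is log2(4) + 1 - 1 == 2, i.e.
          * addr = idx * 4 bytes.
          */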

         /* We can only address up to limit bytes using the indirect
          * addressing immediate; account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit)
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
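         /* For instance, offset == 600 adds 512 to the address register and
          * leaves 600 % 512 == 88 for the addressing immediate below.
          */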

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         brw_MOV(p, dst,
                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
                        src.type));
      } else {
         /* In SIMD4x2 mode the index can be either zero or one; replicate
          * it to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
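         /* f1.0 now holds ones in the channels of each slot whose index is
          * one and zeros in those whose index is zero,
          */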

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword
 * for all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just
 * write one u32.  So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(p->devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_src1(p, send, brw_imm_ud(0));
   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);

   brw_pop_insn_state(p);
}


/**
 * Emit the SEND message for a barrier
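 *
 * The message gateway signals the waiting threads through the notification
 * register, which brw_WAIT() below then blocks on.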
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());

   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
                              1 /* msg_length */,
                              0 /* response_length */,
                              false /* header_present */,
                              false /* end_of_thread */);

   brw_inst_set_gateway_notify(devinfo, inst, 1);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}


/**
 * Emit the wait instruction for a barrier
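 *
 * WAIT sources and writes back the notification register, stalling the
 * thread until the message gateway has signalled it.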
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}