i965: update gl_PrimitiveIDIn to be a system variable
src/mesa/drivers/dri/i965/brw_eu_emit.c
/*
 Copyright (C) Intel Corp. 2006. All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */


#include "brw_context.h"
#include "brw_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}

static void
gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}

/**
 * Convert a brw_reg_type enumeration value into the hardware representation.
 *
 * The hardware encoding may depend on whether the value is an immediate.
 */
unsigned
brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
                        enum brw_reg_type type, enum brw_reg_file file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      static const int imm_hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UB] = -1,
         [BRW_REGISTER_TYPE_B]  = -1,
         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(imm_hw_types));
      assert(imm_hw_types[type] != -1);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
      return imm_hw_types[type];
   } else {
      /* Non-immediate registers */
      static const int hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UV] = -1,
         [BRW_REGISTER_TYPE_VF] = -1,
         [BRW_REGISTER_TYPE_V]  = -1,
         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(hw_types));
      assert(hw_types[type] != -1);
      assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
      return hw_types[type];
   }
}

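/**
 * Encode a destination register (file, type, address mode and region) into
 * the destination fields of an instruction, shrinking the execution size to
 * match small registers where needed.
 */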
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
   brw_inst_set_dst_reg_type(devinfo, inst,
                             brw_reg_type_to_hw_type(devinfo, dest.type,
                                                     dest.file));
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
         /* even ignored in da16, still need to set as '01' */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, we automatically reduce it to match the register size.
    *
    * In platforms that support fp64 we can emit instructions with a width of
    * 4 that need two SIMD8 registers and an exec_size of 8 or 16.  In these
    * cases we need to make sure that these instructions have their exec sizes
    * set properly when they are emitted and we can't rely on this code to fix
    * it.
    */
   bool fix_exec_size;
   if (devinfo->gen >= 6)
      fix_exec_size = dest.width < BRW_EXECUTE_4;
   else
      fix_exec_size = dest.width < BRW_EXECUTE_8;

   if (fix_exec_size)
      brw_inst_set_exec_size(devinfo, inst, dest.width);
}

extern int reg_type_size[];

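/**
 * Sanity-check a register region (width, hstride, vstride) against the
 * instruction's execution size, following the numbered rules from the
 * "Register Region Restrictions" section of the PRM cited below.
 */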
static void
validate_reg(const struct gen_device_info *devinfo,
             brw_inst *inst, struct brw_reg reg)
{
   const int hstride_for_reg[] = {0, 1, 2, 4};
   const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
   const int width_for_reg[] = {1, 2, 4, 8, 16};
   const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
       * mean the destination has to be 128-bit aligned and the
       * destination horiz stride has to be a word.
       */
      if (reg.type == BRW_REGISTER_TYPE_V) {
         assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
                reg_type_size[brw_inst_dst_reg_type(devinfo, inst)] == 2);
      }

      return;
   }

   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_NULL)
      return;

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Swizzling is not allowed when an accumulator is used as an implicit
    *    source or an explicit source in an instruction."
    */
   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_ACCUMULATOR)
      assert(reg.swizzle == BRW_SWIZZLE_XYZW);

   assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
   width = width_for_reg[reg.width];

   assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
          brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
   execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];

   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
}

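/**
 * Return true if a 32-bit immediate can be represented in a compacted
 * instruction, which stores the low 12 bits verbatim plus a single bit
 * that is replicated through the top 20 bits.
 */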
static bool
is_compactable_immediate(unsigned imm)
{
   /* We get the low 12 bits as-is. */
   imm &= ~0xfff;

   /* We get one bit replicated through the top 20 bits. */
   return imm == 0 || imm == 0xfffff000;
}

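/**
 * Encode source 0 (register or immediate, with modifiers and region) into
 * an instruction, including the type fixups that keep immediates
 * compactable.
 */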
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src0_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      if (reg.type == BRW_REGISTER_TYPE_DF ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
         brw_inst_set_imm_df(devinfo, inst, reg.df);
      else
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);

      /* The Bspec's section titled "Non-present Operands" claims that if src0
       * is an immediate, src1's type must be the same as that of src0.
       *
       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
       * that do not follow this rule.  E.g., from the IVB/HSW table:
       *
       *    DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *          3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
       *
       * And from the SNB table:
       *
       *    DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *          8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
       *
       * Neither of these causes warnings from the simulator when used,
       * compacted or otherwise.  In fact, all compaction mappings that have an
       * immediate in src0 use a:ud for src1.
       *
       * The GM45 instruction compaction tables do not contain mapped meanings
       * so it's not clear whether it has the restriction.  We'll assume it was
       * lifted on SNB.  (FINISHME: decode the GM45 tables and check.)
       *
       * Don't do any of this for 64-bit immediates, since the src1 fields
       * overlap with the immediate and setting them would overwrite the
       * immediate we set.
       */
      if (type_sz(reg.type) < 8) {
         brw_inst_set_src1_reg_file(devinfo, inst,
                                    BRW_ARCHITECTURE_REGISTER_FILE);
         if (devinfo->gen < 6) {
            brw_inst_set_src1_reg_type(devinfo, inst,
                                       brw_inst_src0_reg_type(devinfo, inst));
         } else {
            brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         }
      }

      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
       * for immediate values.  Presumably the hardware engineers realized
       * that the only useful floating-point value that could be represented
       * in this format is 0.0, which can also be represented as a VF-typed
       * immediate, so they gave us the previously mentioned mapping on IVB+.
       *
       * Strangely, we do have a mapping for imm:f in src1, so we don't need
       * to do this there.
       *
       * If we see a 0.0:F, change the type to VF so that it can be compacted.
       */
      if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
      }

      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
       * set the types to :UD so the instruction can be compacted.
       */
      if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
          brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
          brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
         } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
      }
   }
}

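/**
 * Encode source 1 into an instruction.  Unlike src0, src1 may never be an
 * accumulator or an MRF, and in a two-argument instruction only a 32-bit
 * immediate is allowed here.
 */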
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *    operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src1_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* two-argument instructions can only use 32-bit immediates */
      assert(type_sz(reg.type) < 8);
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
      }
   }
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
void
brw_set_message_descriptor(struct brw_codegen *p,
                           brw_inst *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_src1(p, inst, brw_imm_d(0));

   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
    * itself; instead, it will be a MOV/OR into the address register.
    *
    * In this case, we avoid setting the extended message descriptor bits,
    * since they go on the later SEND/SENDC and, if set here, would clobber
    * the conditional modifier bits instead.
    */
   unsigned opcode = brw_inst_opcode(devinfo, inst);
   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
      brw_inst_set_sfid(devinfo, inst, sfid);
   }

   brw_inst_set_mlen(devinfo, inst, msg_length);
   brw_inst_set_rlen(devinfo, inst, response_length);
   brw_inst_set_eot(devinfo, inst, end_of_thread);

   if (devinfo->gen >= 5) {
      brw_inst_set_header_present(devinfo, inst, header_present);
   }
}

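/**
 * Fill out the message descriptor for a message to the math shared
 * function, inferring the message and response lengths from the requested
 * math function.
 */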
static void brw_set_math_message(struct brw_codegen *p,
                                 brw_inst *inst,
                                 unsigned function,
                                 unsigned integer_type,
                                 bool low_precision,
                                 unsigned dataType)
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}


static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}

static void brw_set_urb_message(struct brw_codegen *p,
                                brw_inst *insn,
                                enum brw_urb_write_flags flags,
                                unsigned msg_length,
                                unsigned response_length,
                                unsigned offset,
                                unsigned swizzle_control)
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}

void
brw_set_dp_write_message(struct brw_codegen *p,
                         brw_inst *insn,
                         unsigned binding_table_index,
                         unsigned msg_control,
                         unsigned msg_type,
                         unsigned msg_length,
                         bool header_present,
                         unsigned last_render_target,
                         unsigned response_length,
                         unsigned end_of_thread,
                         unsigned send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned sfid;

   if (devinfo->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (devinfo->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
   brw_inst_set_rt_last(devinfo, insn, last_render_target);
   if (devinfo->gen < 7) {
      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
   }
}

void
brw_set_dp_read_message(struct brw_codegen *p,
                        brw_inst *insn,
                        unsigned binding_table_index,
                        unsigned msg_control,
                        unsigned msg_type,
                        unsigned target_cache,
                        unsigned msg_length,
                        bool header_present,
                        unsigned response_length)
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned sfid;

   if (devinfo->gen >= 7) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else if (target_cache == BRW_DATAPORT_READ_TARGET_DATA_CACHE)
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
      else if (target_cache == BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE)
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
      else
         unreachable("Invalid target cache");

   } else if (devinfo->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
   if (devinfo->gen < 6)
      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
}

void
brw_set_sampler_message(struct brw_codegen *p,
                        brw_inst *inst,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
   brw_inst_set_sampler(devinfo, inst, sampler);
   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
   if (devinfo->gen >= 5) {
      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
   }
}

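/**
 * Fill out the message descriptor for a Gen7+ scratch block read or write.
 * Note the block size encoding: num_regs - 1 on Gen7, but log2(num_regs)
 * on Gen8+.
 */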
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}

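/*
 * Reserve the next slot in the instruction store (growing it if needed),
 * seed it with the current default instruction state, and set its opcode.
 *
 * A typical emit sequence (a sketch; compare brw_alu1() below) is:
 *
 *    brw_inst *insn = brw_next_insn(p, opcode);
 *    brw_set_dest(p, insn, dest);
 *    brw_set_src0(p, insn, src);
 */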
#define next_insn brw_next_insn
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   brw_inst_set_opcode(devinfo, insn, opcode);
   return insn;
}

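/* Shared helpers for emitting the one-, two- and three-source ALU
 * instructions below.
 */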
static brw_inst *
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static brw_inst *
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}

static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_DF ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_DF:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      default:
         unreachable("not reached");
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}

#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0,                     \
                   struct brw_reg src1)                     \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}

#define ALU3(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

#define ALU3F(OP)                                                 \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                     \
          dest.type == BRW_REGISTER_TYPE_DF);                     \
   if (dest.type == BRW_REGISTER_TYPE_F) {                        \
      assert(src0.type == BRW_REGISTER_TYPE_F);                   \
      assert(src1.type == BRW_REGISTER_TYPE_F);                   \
      assert(src2.type == BRW_REGISTER_TYPE_F);                   \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                  \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                    \
void brw_##OP(struct brw_codegen *p,                                 \
              struct brw_reg dest,                                   \
              struct brw_reg src)                                    \
{                                                                    \
   const struct gen_device_info *devinfo = p->devinfo;               \
   brw_inst *rnd, *add;                                              \
   rnd = next_insn(p, BRW_OPCODE_##OP);                              \
   brw_set_dest(p, rnd, dest);                                       \
   brw_set_src0(p, rnd, src);                                        \
                                                                     \
   if (devinfo->gen < 6) {                                           \
      /* turn on round-increments */                                 \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);   \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                 \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
   }                                                                 \
}


ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)


brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_ud(0u));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}

brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *    Because this instruction does not have a 16-bit floating-point
       *    type, the source data type must be Word (W). The destination type
       *    must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}


void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_1);
   brw_set_dest(p, insn, retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, insn, retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, insn, brw_imm_ud(0x0));
}


/***********************************************************************
 * Comparisons, if/else/endif
 */

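/**
 * Emit a JMPI (jump indexed) instruction, which adds `index` to IP.  The
 * jump runs with mask control disabled, so it executes even when all
 * channels are disabled; predication is supplied by the caller.
 */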
brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

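/* The IF/ELSE and loop stacks store offsets into p->store rather than raw
 * instruction pointers, since brw_next_insn() may reallocate the store.
 */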
static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}

void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1610
1611 void
1612 brw_ENDIF(struct brw_codegen *p)
1613 {
1614 const struct gen_device_info *devinfo = p->devinfo;
1615 brw_inst *insn = NULL;
1616 brw_inst *else_inst = NULL;
1617 brw_inst *if_inst = NULL;
1618 brw_inst *tmp;
1619 bool emit_endif = true;
1620
1621 /* In single program flow mode, we can express IF and ELSE instructions
1622 * equivalently as ADD instructions that operate on IP. On platforms prior
1623 * to Gen6, flow control instructions cause an implied thread switch, so
1624 * this is a significant savings.
1625 *
1626 * However, on Gen6, writing to IP doesn't work in single program flow mode
1627 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1628 * not be updated by non-flow control instructions."). And on later
1629 * platforms, there is no significant benefit to converting control flow
1630 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1631 * Gen5.
1632 */
1633 if (devinfo->gen < 6 && p->single_program_flow)
1634 emit_endif = false;
1635
1636 /*
1637 * A single next_insn() may change the base address of the instruction
1638 * store memory (p->store), so call it first, before computing any
1639 * pointer into the instruction store from an index.
1640 */
1641 if (emit_endif)
1642 insn = next_insn(p, BRW_OPCODE_ENDIF);
1643
1644 /* Pop the IF and (optional) ELSE instructions from the stack */
1645 p->if_depth_in_loop[p->loop_stack_depth]--;
1646 tmp = pop_if_stack(p);
1647 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1648 else_inst = tmp;
1649 tmp = pop_if_stack(p);
1650 }
1651 if_inst = tmp;
1652
1653 if (!emit_endif) {
1654 /* ENDIF is useless; don't bother emitting it. */
1655 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1656 return;
1657 }
1658
1659 if (devinfo->gen < 6) {
1660 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1661 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1662 brw_set_src1(p, insn, brw_imm_d(0x0));
1663 } else if (devinfo->gen == 6) {
1664 brw_set_dest(p, insn, brw_imm_w(0));
1665 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1666 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1667 } else if (devinfo->gen == 7) {
1668 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1669 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1670 brw_set_src1(p, insn, brw_imm_w(0));
1671 } else {
1672 brw_set_src0(p, insn, brw_imm_d(0));
1673 }
1674
1675 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1676 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1677 if (devinfo->gen < 6)
1678 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1679
1680 /* Also pop item off the stack in the endif instruction: */
1681 if (devinfo->gen < 6) {
1682 brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1683 brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1684 } else if (devinfo->gen == 6) {
1685 brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1686 } else {
1687 brw_inst_set_jip(devinfo, insn, 2);
1688 }
1689 patch_IF_ELSE(p, if_inst, else_inst, insn);
1690 }
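
/* Usage sketch: emitting a structured IF/ELSE/ENDIF region with the
 * helpers above.  brw_IF()'s signature is assumed from brw_eu.h, and the
 * MOVs standing in for the then/else bodies are hypothetical.  Jump
 * targets are left as zero by brw_IF()/brw_ELSE() and are fixed up by
 * patch_IF_ELSE() once brw_ENDIF() is emitted.
 */
static void
example_if_else(struct brw_codegen *p, struct brw_reg dst,
                struct brw_reg a, struct brw_reg b)
{
   brw_IF(p, BRW_EXECUTE_8);   /* predicated on the current flag register */
   brw_MOV(p, dst, a);         /* then-clause */
   brw_ELSE(p);
   brw_MOV(p, dst, b);         /* else-clause */
   brw_ENDIF(p);               /* pops the if-stack and patches the jumps */
}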
1691
1692 brw_inst *
1693 brw_BREAK(struct brw_codegen *p)
1694 {
1695 const struct gen_device_info *devinfo = p->devinfo;
1696 brw_inst *insn;
1697
1698 insn = next_insn(p, BRW_OPCODE_BREAK);
1699 if (devinfo->gen >= 8) {
1700 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1701 brw_set_src0(p, insn, brw_imm_d(0x0));
1702 } else if (devinfo->gen >= 6) {
1703 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1704 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1705 brw_set_src1(p, insn, brw_imm_d(0x0));
1706 } else {
1707 brw_set_dest(p, insn, brw_ip_reg());
1708 brw_set_src0(p, insn, brw_ip_reg());
1709 brw_set_src1(p, insn, brw_imm_d(0x0));
1710 brw_inst_set_gen4_pop_count(devinfo, insn,
1711 p->if_depth_in_loop[p->loop_stack_depth]);
1712 }
1713 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1714 brw_inst_set_exec_size(devinfo, insn,
1715 brw_inst_exec_size(devinfo, p->current));
1716
1717 return insn;
1718 }
1719
1720 brw_inst *
1721 brw_CONT(struct brw_codegen *p)
1722 {
1723 const struct gen_device_info *devinfo = p->devinfo;
1724 brw_inst *insn;
1725
1726 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1727 brw_set_dest(p, insn, brw_ip_reg());
1728 if (devinfo->gen >= 8) {
1729 brw_set_src0(p, insn, brw_imm_d(0x0));
1730 } else {
1731 brw_set_src0(p, insn, brw_ip_reg());
1732 brw_set_src1(p, insn, brw_imm_d(0x0));
1733 }
1734
1735 if (devinfo->gen < 6) {
1736 brw_inst_set_gen4_pop_count(devinfo, insn,
1737 p->if_depth_in_loop[p->loop_stack_depth]);
1738 }
1739 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1740 brw_inst_set_exec_size(devinfo, insn,
1741 brw_inst_exec_size(devinfo, p->current));
1742 return insn;
1743 }
1744
1745 brw_inst *
1746 gen6_HALT(struct brw_codegen *p)
1747 {
1748 const struct gen_device_info *devinfo = p->devinfo;
1749 brw_inst *insn;
1750
1751 insn = next_insn(p, BRW_OPCODE_HALT);
1752 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1753 if (devinfo->gen >= 8) {
1754 brw_set_src0(p, insn, brw_imm_d(0x0));
1755 } else {
1756 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1757 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1758 }
1759
1760 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1761 brw_inst_set_exec_size(devinfo, insn,
1762 brw_inst_exec_size(devinfo, p->current));
1763 return insn;
1764 }
1765
1766 /* DO/WHILE loop:
1767 *
1768 * The DO/WHILE is just an unterminated loop -- break or continue are
1769 * used for control within the loop. We have a few ways they can be
1770 * done.
1771 *
1772 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1773 * jip and no DO instruction.
1774 *
1775 * For non-uniform control flow pre-gen6, there's a DO instruction to
1776 * push the mask, and a WHILE to jump back, and BREAK to get out and
1777 * pop the mask.
1778 *
1779 * For gen6, there's no more mask stack, so no need for DO. WHILE
1780 * just points back to the first instruction of the loop.
1781 */
1782 brw_inst *
1783 brw_DO(struct brw_codegen *p, unsigned execute_size)
1784 {
1785 const struct gen_device_info *devinfo = p->devinfo;
1786
1787 if (devinfo->gen >= 6 || p->single_program_flow) {
1788 push_loop_stack(p, &p->store[p->nr_insn]);
1789 return &p->store[p->nr_insn];
1790 } else {
1791 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1792
1793 push_loop_stack(p, insn);
1794
1795 /* Override the defaults for this instruction:
1796 */
1797 brw_set_dest(p, insn, brw_null_reg());
1798 brw_set_src0(p, insn, brw_null_reg());
1799 brw_set_src1(p, insn, brw_null_reg());
1800
1801 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1802 brw_inst_set_exec_size(devinfo, insn, execute_size);
1803 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1804
1805 return insn;
1806 }
1807 }
1808
1809 /**
1810 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1811 * instruction here.
1812 *
1813 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1814 * nesting, since it can always just point to the end of the block/current loop.
1815 */
1816 static void
1817 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1818 {
1819 const struct gen_device_info *devinfo = p->devinfo;
1820 brw_inst *do_inst = get_inner_do_insn(p);
1821 brw_inst *inst;
1822 unsigned br = brw_jump_scale(devinfo);
1823
1824 assert(devinfo->gen < 6);
1825
1826 for (inst = while_inst - 1; inst != do_inst; inst--) {
1827 /* If the jump count is != 0, this instruction has already been
1828 * patched because it's part of a loop inside the one we're
1829 * patching.
1830 */
1831 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1832 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1833 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1834 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1835 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1836 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1837 }
1838 }
1839 }
1840
1841 brw_inst *
1842 brw_WHILE(struct brw_codegen *p)
1843 {
1844 const struct gen_device_info *devinfo = p->devinfo;
1845 brw_inst *insn, *do_insn;
1846 unsigned br = brw_jump_scale(devinfo);
1847
1848 if (devinfo->gen >= 6) {
1849 insn = next_insn(p, BRW_OPCODE_WHILE);
1850 do_insn = get_inner_do_insn(p);
1851
1852 if (devinfo->gen >= 8) {
1853 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1854 brw_set_src0(p, insn, brw_imm_d(0));
1855 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1856 } else if (devinfo->gen == 7) {
1857 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1858 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1859 brw_set_src1(p, insn, brw_imm_w(0));
1860 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1861 } else {
1862 brw_set_dest(p, insn, brw_imm_w(0));
1863 brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1864 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1865 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1866 }
1867
1868 brw_inst_set_exec_size(devinfo, insn,
1869 brw_inst_exec_size(devinfo, p->current));
1870
1871 } else {
1872 if (p->single_program_flow) {
1873 insn = next_insn(p, BRW_OPCODE_ADD);
1874 do_insn = get_inner_do_insn(p);
1875
1876 brw_set_dest(p, insn, brw_ip_reg());
1877 brw_set_src0(p, insn, brw_ip_reg());
1878 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1879 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1880 } else {
1881 insn = next_insn(p, BRW_OPCODE_WHILE);
1882 do_insn = get_inner_do_insn(p);
1883
1884 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1885
1886 brw_set_dest(p, insn, brw_ip_reg());
1887 brw_set_src0(p, insn, brw_ip_reg());
1888 brw_set_src1(p, insn, brw_imm_d(0));
1889
1890 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1891 brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1892 brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1893
1894 brw_patch_break_cont(p, insn);
1895 }
1896 }
1897 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1898
1899 p->loop_stack_depth--;
1900
1901 return insn;
1902 }
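
/* Usage sketch: a DO/WHILE loop with an early exit.  The decrement, the
 * CMP flag setup and the register choice are hypothetical; the point is
 * the pairing of brw_DO()/brw_WHILE() with a predicated brw_BREAK whose
 * jump targets are filled in later (by brw_patch_break_cont() pre-gen6,
 * by brw_set_uip_jip() on gen6+).
 */
static void
example_loop(struct brw_codegen *p, struct brw_reg counter)
{
   brw_inst *brk;

   brw_DO(p, BRW_EXECUTE_8);
   brw_ADD(p, counter, counter, brw_imm_d(-1));
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, counter, brw_imm_d(0));

   brk = brw_BREAK(p);
   brw_inst_set_pred_control(p->devinfo, brk, BRW_PREDICATE_NORMAL);

   brw_WHILE(p);
}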
1903
1904 /* FORWARD JUMPS:
1905 */
1906 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1907 {
1908 const struct gen_device_info *devinfo = p->devinfo;
1909 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1910 unsigned jmpi = 1;
1911
1912 if (devinfo->gen >= 5)
1913 jmpi = 2;
1914
1915 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1916 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1917
1918 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1919 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1920 }
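
/* Usage sketch: the caller records the index of the JMPI before emitting
 * the instructions to be skipped, then lands the jump afterwards.  The
 * brw_JMPI() signature (index register plus predicate control) is
 * assumed from brw_eu.h, and the skipped MOV is hypothetical.
 */
static void
example_fwd_jump(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   int jmp_idx = p->nr_insn;
   brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL);

   brw_MOV(p, dst, src);            /* skipped when the predicate is true */

   brw_land_fwd_jump(p, jmp_idx);   /* patch the jump count to land here */
}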
1921
1922 /* To integrate with the above, it makes sense that the comparison
1923 * instruction should populate the flag register. It might be simpler
1924 * just to use the flag reg for most WM tasks?
1925 */
1926 void brw_CMP(struct brw_codegen *p,
1927 struct brw_reg dest,
1928 unsigned conditional,
1929 struct brw_reg src0,
1930 struct brw_reg src1)
1931 {
1932 const struct gen_device_info *devinfo = p->devinfo;
1933 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1934
1935 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1936 brw_set_dest(p, insn, dest);
1937 brw_set_src0(p, insn, src0);
1938 brw_set_src1(p, insn, src1);
1939
1940 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1941 * page says:
1942 * "Any CMP instruction with a null destination must use a {switch}."
1943 *
1944 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1945 * mentioned on their work-arounds pages.
1946 */
1947 if (devinfo->gen == 7) {
1948 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1949 dest.nr == BRW_ARF_NULL) {
1950 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1951 }
1952 }
1953 }
1954
1955 /***********************************************************************
1956 * Helpers for the various SEND message types:
1957 */
1958
1959 /** Extended math function, float[8].
1960 */
1961 void gen4_math(struct brw_codegen *p,
1962 struct brw_reg dest,
1963 unsigned function,
1964 unsigned msg_reg_nr,
1965 struct brw_reg src,
1966 unsigned precision )
1967 {
1968 const struct gen_device_info *devinfo = p->devinfo;
1969 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1970 unsigned data_type;
1971 if (has_scalar_region(src)) {
1972 data_type = BRW_MATH_DATA_SCALAR;
1973 } else {
1974 data_type = BRW_MATH_DATA_VECTOR;
1975 }
1976
1977 assert(devinfo->gen < 6);
1978
1979 /* Example code doesn't set predicate_control for send
1980 * instructions.
1981 */
1982 brw_inst_set_pred_control(devinfo, insn, 0);
1983 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1984
1985 brw_set_dest(p, insn, dest);
1986 brw_set_src0(p, insn, src);
1987 brw_set_math_message(p,
1988 insn,
1989 function,
1990 src.type == BRW_REGISTER_TYPE_D,
1991 precision,
1992 data_type);
1993 }
1994
1995 void gen6_math(struct brw_codegen *p,
1996 struct brw_reg dest,
1997 unsigned function,
1998 struct brw_reg src0,
1999 struct brw_reg src1)
2000 {
2001 const struct gen_device_info *devinfo = p->devinfo;
2002 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
2003
2004 assert(devinfo->gen >= 6);
2005
2006 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
2007 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
2008 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
2009 (devinfo->gen >= 8 && src0.file == BRW_IMMEDIATE_VALUE));
2010
2011 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
2012 if (devinfo->gen == 6) {
2013 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
2014 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
2015 }
2016
2017 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2018 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
2019 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2020 assert(src0.type != BRW_REGISTER_TYPE_F);
2021 assert(src1.type != BRW_REGISTER_TYPE_F);
2022 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2023 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2024 } else {
2025 assert(src0.type == BRW_REGISTER_TYPE_F);
2026 assert(src1.type == BRW_REGISTER_TYPE_F);
2027 if (function == BRW_MATH_FUNCTION_POW) {
2028 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2029 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2030 } else {
2031 assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2032 src1.nr == BRW_ARF_NULL);
2033 }
2034 }
2035
2036 /* Source modifiers are ignored for extended math instructions on Gen6. */
2037 if (devinfo->gen == 6) {
2038 assert(!src0.negate);
2039 assert(!src0.abs);
2040 assert(!src1.negate);
2041 assert(!src1.abs);
2042 }
2043
2044 brw_inst_set_math_function(devinfo, insn, function);
2045
2046 brw_set_dest(p, insn, dest);
2047 brw_set_src0(p, insn, src0);
2048 brw_set_src1(p, insn, src1);
2049 }
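
/* Usage sketch: a gen6+ reciprocal square root, assuming dst and src are
 * unit-stride float GRF regions.  Single-source math functions pass the
 * (float-typed) null register as src1, which is exactly what the asserts
 * above require; only INT_DIV and POW take a real second source.
 */
static void
example_rsq(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   gen6_math(p, retype(dst, BRW_REGISTER_TYPE_F), BRW_MATH_FUNCTION_RSQ,
             retype(src, BRW_REGISTER_TYPE_F), brw_null_reg());
}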
2050
2051 /**
2052 * Return the right surface index to access the thread scratch space using
2053 * stateless dataport messages.
2054 */
2055 unsigned
2056 brw_scratch_surface_idx(const struct brw_codegen *p)
2057 {
2058 /* The scratch space is thread-local so IA coherency is unnecessary. */
2059 if (p->devinfo->gen >= 8)
2060 return GEN8_BTI_STATELESS_NON_COHERENT;
2061 else
2062 return BRW_BTI_STATELESS;
2063 }
2064
2065 /**
2066 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
2067 * using a constant offset per channel.
2068 *
2069 * The offset must be aligned to oword size (16 bytes). Used for
2070 * register spilling.
2071 */
2072 void brw_oword_block_write_scratch(struct brw_codegen *p,
2073 struct brw_reg mrf,
2074 int num_regs,
2075 unsigned offset)
2076 {
2077 const struct gen_device_info *devinfo = p->devinfo;
2078 uint32_t msg_type;
2079
2080 if (devinfo->gen >= 6)
2081 offset /= 16;
2082
2083 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2084
2085 const unsigned mlen = 1 + num_regs;
2086 const unsigned msg_control =
2087 (num_regs == 1 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS :
2088 num_regs == 2 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS :
2089 num_regs == 4 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : 0);
2090 assert(msg_control);
2091
2092 /* Set up the message header. This is g0, with g0.2 filled with
2093 * the offset. We don't want to leave our offset around in g0 or
2094 * it'll screw up texture samples, so set it up inside the message
2095 * reg.
2096 */
2097 {
2098 brw_push_insn_state(p);
2099 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2100 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2101 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2102
2103 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2104
2105 /* set message header global offset field (reg 0, element 2) */
2106 brw_MOV(p,
2107 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2108 mrf.nr,
2109 2), BRW_REGISTER_TYPE_UD),
2110 brw_imm_ud(offset));
2111
2112 brw_pop_insn_state(p);
2113 }
2114
2115 {
2116 struct brw_reg dest;
2117 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2118 int send_commit_msg;
2119 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2120 BRW_REGISTER_TYPE_UW);
2121
2122 brw_inst_set_compression(devinfo, insn, false);
2123
2124 if (brw_inst_exec_size(devinfo, insn) >= 16)
2125 src_header = vec16(src_header);
2126
2127 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2128 if (devinfo->gen < 6)
2129 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2130
2131 /* Until gen6, writes followed by reads from the same location
2132 * are not guaranteed to be ordered unless write_commit is set.
2133 * If set, then a no-op write is issued to the destination
2134 * register to set a dependency, and a read from the destination
2135 * can be used to ensure the ordering.
2136 *
2137 * For gen6, only writes between different threads need ordering
2138 * protection. Our use of DP writes is all about register
2139 * spilling within a thread.
2140 */
2141 if (devinfo->gen >= 6) {
2142 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2143 send_commit_msg = 0;
2144 } else {
2145 dest = src_header;
2146 send_commit_msg = 1;
2147 }
2148
2149 brw_set_dest(p, insn, dest);
2150 if (devinfo->gen >= 6) {
2151 brw_set_src0(p, insn, mrf);
2152 } else {
2153 brw_set_src0(p, insn, brw_null_reg());
2154 }
2155
2156 if (devinfo->gen >= 6)
2157 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2158 else
2159 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2160
2161 brw_set_dp_write_message(p,
2162 insn,
2163 brw_scratch_surface_idx(p),
2164 msg_control,
2165 msg_type,
2166 mlen,
2167 true, /* header_present */
2168 0, /* not a render target */
2169 send_commit_msg, /* response_length */
2170 0, /* eot */
2171 send_commit_msg);
2172 }
2173 }
2174
2175
2176 /**
2177 * Read a block of owords (half a GRF each) from the scratch buffer
2178 * using a constant index per channel.
2179 *
2180 * Offset must be aligned to oword size (16 bytes). Used for register
2181 * spilling.
2182 */
2183 void
2184 brw_oword_block_read_scratch(struct brw_codegen *p,
2185 struct brw_reg dest,
2186 struct brw_reg mrf,
2187 int num_regs,
2188 unsigned offset)
2189 {
2190 const struct gen_device_info *devinfo = p->devinfo;
2191
2192 if (devinfo->gen >= 6)
2193 offset /= 16;
2194
2195 if (p->devinfo->gen >= 7) {
2196 /* On gen 7 and above, we no longer have message registers and we can
2197 * send from any register we want. By using the destination register
2198 * for the message, we guarantee that the implied message write won't
2199 * accidentally overwrite anything. This has been a problem because
2200 * the MRF registers and source for the final FB write are both fixed
2201 * and may overlap.
2202 */
2203 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2204 } else {
2205 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2206 }
2207 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2208
2209 const unsigned rlen = num_regs;
2210 const unsigned msg_control =
2211 (num_regs == 1 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS :
2212 num_regs == 2 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS :
2213 num_regs == 4 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : 0);
2214 assert(msg_control);
2215 const unsigned target_cache = devinfo->gen >= 7 ?
2216 BRW_DATAPORT_READ_TARGET_DATA_CACHE :
2217 BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
2218
2219 {
2220 brw_push_insn_state(p);
2221 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2222 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2223 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2224
2225 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2226
2227 /* set message header global offset field (reg 0, element 2) */
2228 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2229
2230 brw_pop_insn_state(p);
2231 }
2232
2233 {
2234 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2235
2236 assert(brw_inst_pred_control(devinfo, insn) == 0);
2237 brw_inst_set_compression(devinfo, insn, false);
2238
2239 brw_set_dest(p, insn, dest); /* UW? */
2240 if (devinfo->gen >= 6) {
2241 brw_set_src0(p, insn, mrf);
2242 } else {
2243 brw_set_src0(p, insn, brw_null_reg());
2244 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2245 }
2246
2247 brw_set_dp_read_message(p,
2248 insn,
2249 brw_scratch_surface_idx(p),
2250 msg_control,
2251 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2252 target_cache,
2253 1, /* msg_length */
2254 true, /* header_present */
2255 rlen);
2256 }
2257 }
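
/* Usage sketch: spilling one GRF to scratch and filling it back.  A GRF
 * is two OWORDs, so num_regs == 1 and the byte offset must stay 16-byte
 * aligned; the register and offset choices here are hypothetical.  The
 * data to spill must sit just above the header register in the payload.
 */
static void
example_spill_fill(struct brw_codegen *p)
{
   struct brw_reg value = brw_vec8_grf(14, 0);  /* value to spill */
   struct brw_reg mrf = brw_message_reg(1);     /* message header */

   brw_MOV(p, brw_message_reg(2), value);       /* payload follows header */
   brw_oword_block_write_scratch(p, mrf, 1, 32);

   brw_oword_block_read_scratch(p, value, mrf, 1, 32);
}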
2258
2259 void
2260 gen7_block_read_scratch(struct brw_codegen *p,
2261 struct brw_reg dest,
2262 int num_regs,
2263 unsigned offset)
2264 {
2265 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2266 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2267
2268 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2269
2270 /* The HW requires that the header is present; this is to get the g0.5
2271 * scratch offset.
2272 */
2273 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2274
2275 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2276 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2277 * is 32 bytes, which happens to be the size of a register.
2278 */
2279 offset /= REG_SIZE;
2280 assert(offset < (1 << 12));
2281
2282 gen7_set_dp_scratch_message(p, insn,
2283 false, /* scratch read */
2284 false, /* OWords */
2285 false, /* invalidate after read */
2286 num_regs,
2287 offset,
2288 1, /* mlen: just g0 */
2289 num_regs, /* rlen */
2290 true); /* header present */
2291 }
2292
2293 /**
2294 * Read a float[4] vector from the data port Data Cache (const buffer).
2295 * Location (in buffer) should be a multiple of 16.
2296 * Used for fetching shader constants.
2297 */
2298 void brw_oword_block_read(struct brw_codegen *p,
2299 struct brw_reg dest,
2300 struct brw_reg mrf,
2301 uint32_t offset,
2302 uint32_t bind_table_index)
2303 {
2304 const struct gen_device_info *devinfo = p->devinfo;
2305
2306 /* On newer hardware, offset is in units of owords. */
2307 if (devinfo->gen >= 6)
2308 offset /= 16;
2309
2310 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2311
2312 brw_push_insn_state(p);
2313 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2314 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2315 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2316 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2317
2318 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2319
2320 /* set message header global offset field (reg 0, element 2) */
2321 brw_MOV(p,
2322 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2323 mrf.nr,
2324 2), BRW_REGISTER_TYPE_UD),
2325 brw_imm_ud(offset));
2326
2327 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2328
2329 /* cast dest to a uword[8] vector */
2330 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2331
2332 brw_set_dest(p, insn, dest);
2333 if (devinfo->gen >= 6) {
2334 brw_set_src0(p, insn, mrf);
2335 } else {
2336 brw_set_src0(p, insn, brw_null_reg());
2337 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2338 }
2339
2340 brw_set_dp_read_message(p,
2341 insn,
2342 bind_table_index,
2343 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2344 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2345 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2346 1, /* msg_length */
2347 true, /* header_present */
2348 1); /* response_length (1 reg, 2 owords!) */
2349
2350 brw_pop_insn_state(p);
2351 }
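
/* Usage sketch: fetching one float[4] constant into a GRF.  The MRF
 * number, the byte offset and the binding table slot are hypothetical;
 * the offset must be a multiple of 16 as noted above.
 */
static void
example_fetch_const(struct brw_codegen *p, struct brw_reg dst)
{
   brw_oword_block_read(p, dst, brw_message_reg(1), 0 /* offset */,
                        3 /* binding table slot */);
}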
2352
2353
2354 void brw_fb_WRITE(struct brw_codegen *p,
2355 struct brw_reg payload,
2356 struct brw_reg implied_header,
2357 unsigned msg_control,
2358 unsigned binding_table_index,
2359 unsigned msg_length,
2360 unsigned response_length,
2361 bool eot,
2362 bool last_render_target,
2363 bool header_present)
2364 {
2365 const struct gen_device_info *devinfo = p->devinfo;
2366 brw_inst *insn;
2367 unsigned msg_type;
2368 struct brw_reg dest, src0;
2369
2370 if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
2371 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2372 else
2373 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2374
2375 if (devinfo->gen >= 6) {
2376 insn = next_insn(p, BRW_OPCODE_SENDC);
2377 } else {
2378 insn = next_insn(p, BRW_OPCODE_SEND);
2379 }
2380 brw_inst_set_compression(devinfo, insn, false);
2381
2382 if (devinfo->gen >= 6) {
2383 /* headerless version, just submit color payload */
2384 src0 = payload;
2385
2386 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2387 } else {
2388 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2389 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2390 src0 = implied_header;
2391
2392 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2393 }
2394
2395 brw_set_dest(p, insn, dest);
2396 brw_set_src0(p, insn, src0);
2397 brw_set_dp_write_message(p,
2398 insn,
2399 binding_table_index,
2400 msg_control,
2401 msg_type,
2402 msg_length,
2403 header_present,
2404 last_render_target,
2405 response_length,
2406 eot,
2407 0 /* send_commit_msg */);
2408 }
2409
2410 brw_inst *
2411 gen9_fb_READ(struct brw_codegen *p,
2412 struct brw_reg dst,
2413 struct brw_reg payload,
2414 unsigned binding_table_index,
2415 unsigned msg_length,
2416 unsigned response_length,
2417 bool per_sample)
2418 {
2419 const struct gen_device_info *devinfo = p->devinfo;
2420 assert(devinfo->gen >= 9);
2421 const unsigned msg_subtype =
2422 brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16 ? 0 : 1;
2423 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2424
2425 brw_set_dest(p, insn, dst);
2426 brw_set_src0(p, insn, payload);
2427 brw_set_dp_read_message(p, insn, binding_table_index,
2428 per_sample << 5 | msg_subtype,
2429 GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2430 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2431 msg_length, true /* header_present */,
2432 response_length);
2433 brw_inst_set_rt_slot_group(devinfo, insn,
2434 brw_inst_qtr_control(devinfo, p->current) / 2);
2435
2436 return insn;
2437 }
2438
2439 /**
2440 * Texture sample instruction.
2441 * Note: the msg_type plus msg_length values determine exactly what kind
2442 * of sampling operation is performed. See volume 4, page 161 of docs.
2443 */
2444 void brw_SAMPLE(struct brw_codegen *p,
2445 struct brw_reg dest,
2446 unsigned msg_reg_nr,
2447 struct brw_reg src0,
2448 unsigned binding_table_index,
2449 unsigned sampler,
2450 unsigned msg_type,
2451 unsigned response_length,
2452 unsigned msg_length,
2453 unsigned header_present,
2454 unsigned simd_mode,
2455 unsigned return_format)
2456 {
2457 const struct gen_device_info *devinfo = p->devinfo;
2458 brw_inst *insn;
2459
2460 if (msg_reg_nr != -1)
2461 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2462
2463 insn = next_insn(p, BRW_OPCODE_SEND);
2464 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2465
2466 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2467 *
2468 * "Instruction compression is not allowed for this instruction (that
2469 * is, send). The hardware behavior is undefined if this instruction is
2470 * set as compressed. However, compress control can be set to "SecHalf"
2471 * to affect the EMask generation."
2472 *
2473 * No similar wording is found in later PRMs, but there are examples
2474 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2475 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2476 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2477 */
2478 brw_inst_set_compression(devinfo, insn, false);
2479
2480 if (devinfo->gen < 6)
2481 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2482
2483 brw_set_dest(p, insn, dest);
2484 brw_set_src0(p, insn, src0);
2485 brw_set_sampler_message(p, insn,
2486 binding_table_index,
2487 sampler,
2488 msg_type,
2489 response_length,
2490 msg_length,
2491 header_present,
2492 simd_mode,
2493 return_format);
2494 }
2495
2496 /* Adjust the message header's sampler state pointer to
2497 * select the correct group of 16 samplers.
2498 */
2499 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2500 struct brw_reg header,
2501 struct brw_reg sampler_index)
2502 {
2503 /* The "Sampler Index" field can only store values between 0 and 15.
2504 * However, we can add an offset to the "Sampler State Pointer"
2505 * field, effectively selecting a different set of 16 samplers.
2506 *
2507 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2508 * offset, and each sampler state is only 16 bytes, so we can't
2509 * exclusively use the offset; we have to use both.
2510 */
2511
2512 const struct gen_device_info *devinfo = p->devinfo;
2513
2514 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2515 const int sampler_state_size = 16; /* 16 bytes */
2516 uint32_t sampler = sampler_index.ud;
2517
2518 if (sampler >= 16) {
2519 assert(devinfo->is_haswell || devinfo->gen >= 8);
2520 brw_ADD(p,
2521 get_element_ud(header, 3),
2522 get_element_ud(brw_vec8_grf(0, 0), 3),
2523 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2524 }
2525 } else {
2526 /* Non-const sampler array indexing case */
2527 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2528 return;
2529 }
2530
2531 struct brw_reg temp = get_element_ud(header, 3);
2532
2533 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2534 brw_SHL(p, temp, temp, brw_imm_ud(4));
2535 brw_ADD(p,
2536 get_element_ud(header, 3),
2537 get_element_ud(brw_vec8_grf(0, 0), 3),
2538 temp);
2539 }
2540 }
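
/* Worked example of the pointer math above: a group of 16 samplers is
 * 16 * 16 = 256 bytes of sampler state, so an immediate sampler index of
 * 20 selects group 1, adds 256 bytes to the "Sampler State Pointer" and
 * leaves 20 % 16 = 4 for the 4-bit "Sampler Index" field.
 */
static uint32_t
example_sampler_group_offset(uint32_t sampler)
{
   const uint32_t sampler_state_size = 16;           /* bytes */
   return 16 * (sampler / 16) * sampler_state_size;  /* 20 -> 256 */
}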
2541
2542 /* All these variables are pretty confusing - we might be better off
2543 * using bitmasks and macros for this, in the old style. Or perhaps
2544 * just having the caller instantiate the fields in dword3 itself.
2545 */
2546 void brw_urb_WRITE(struct brw_codegen *p,
2547 struct brw_reg dest,
2548 unsigned msg_reg_nr,
2549 struct brw_reg src0,
2550 enum brw_urb_write_flags flags,
2551 unsigned msg_length,
2552 unsigned response_length,
2553 unsigned offset,
2554 unsigned swizzle)
2555 {
2556 const struct gen_device_info *devinfo = p->devinfo;
2557 brw_inst *insn;
2558
2559 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2560
2561 if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2562 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2563 brw_push_insn_state(p);
2564 brw_set_default_access_mode(p, BRW_ALIGN_1);
2565 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2566 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2567 BRW_REGISTER_TYPE_UD),
2568 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2569 brw_imm_ud(0xff00));
2570 brw_pop_insn_state(p);
2571 }
2572
2573 insn = next_insn(p, BRW_OPCODE_SEND);
2574
2575 assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2576
2577 brw_set_dest(p, insn, dest);
2578 brw_set_src0(p, insn, src0);
2579 brw_set_src1(p, insn, brw_imm_d(0));
2580
2581 if (devinfo->gen < 6)
2582 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2583
2584 brw_set_urb_message(p,
2585 insn,
2586 flags,
2587 msg_length,
2588 response_length,
2589 offset,
2590 swizzle);
2591 }
2592
2593 struct brw_inst *
2594 brw_send_indirect_message(struct brw_codegen *p,
2595 unsigned sfid,
2596 struct brw_reg dst,
2597 struct brw_reg payload,
2598 struct brw_reg desc)
2599 {
2600 const struct gen_device_info *devinfo = p->devinfo;
2601 struct brw_inst *send;
2602 int setup;
2603
2604 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2605
2606 assert(desc.type == BRW_REGISTER_TYPE_UD);
2607
2608 /* We hold on to the setup instruction (the SEND in the direct case, the OR
2609 * in the indirect case) by its index in the instruction store. The
2610 * pointer returned by next_insn() may become invalid if emitting the SEND
2611 * in the indirect case reallocs the store.
2612 */
2613
2614 if (desc.file == BRW_IMMEDIATE_VALUE) {
2615 setup = p->nr_insn;
2616 send = next_insn(p, BRW_OPCODE_SEND);
2617 brw_set_src1(p, send, desc);
2618
2619 } else {
2620 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2621
2622 brw_push_insn_state(p);
2623 brw_set_default_access_mode(p, BRW_ALIGN_1);
2624 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2625 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2626
2627 /* Load the indirect descriptor to an address register using OR so the
2628 * caller can specify additional descriptor bits with the usual
2629 * brw_set_*_message() helper functions.
2630 */
2631 setup = p->nr_insn;
2632 brw_OR(p, addr, desc, brw_imm_ud(0));
2633
2634 brw_pop_insn_state(p);
2635
2636 send = next_insn(p, BRW_OPCODE_SEND);
2637 brw_set_src1(p, send, addr);
2638 }
2639
2640 if (dst.width < BRW_EXECUTE_8)
2641 brw_inst_set_exec_size(devinfo, send, dst.width);
2642
2643 brw_set_dest(p, send, dst);
2644 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2645 brw_inst_set_sfid(devinfo, send, sfid);
2646
2647 return &p->store[setup];
2648 }
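
/* Usage sketch: the descriptor may be an immediate (direct SEND) or a UD
 * register (routed through address register a0 via the OR above).  In
 * both cases the returned setup instruction's src1 overlays the SEND
 * descriptor bits, so helpers like brw_inst_set_mlen() can fill in the
 * remaining fields afterwards.  The SFID and payload register here are
 * hypothetical.
 */
static void
example_indirect_send(struct brw_codegen *p, struct brw_reg desc)
{
   struct brw_inst *send =
      brw_send_indirect_message(p, GEN7_SFID_DATAPORT_DATA_CACHE,
                                brw_null_reg(), brw_vec8_grf(10, 0),
                                retype(desc, BRW_REGISTER_TYPE_UD));
   brw_inst_set_mlen(p->devinfo, send, 1);
   brw_inst_set_rlen(p->devinfo, send, 0);
}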
2649
2650 static struct brw_inst *
2651 brw_send_indirect_surface_message(struct brw_codegen *p,
2652 unsigned sfid,
2653 struct brw_reg dst,
2654 struct brw_reg payload,
2655 struct brw_reg surface,
2656 unsigned message_len,
2657 unsigned response_len,
2658 bool header_present)
2659 {
2660 const struct gen_device_info *devinfo = p->devinfo;
2661 struct brw_inst *insn;
2662
2663 if (surface.file != BRW_IMMEDIATE_VALUE) {
2664 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2665
2666 brw_push_insn_state(p);
2667 brw_set_default_access_mode(p, BRW_ALIGN_1);
2668 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2669 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2670
2671 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2672 * some surface array is accessed out of bounds.
2673 */
2674 insn = brw_AND(p, addr,
2675 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2676 BRW_GET_SWZ(surface.swizzle, 0)),
2677 brw_imm_ud(0xff));
2678
2679 brw_pop_insn_state(p);
2680
2681 surface = addr;
2682 }
2683
2684 insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
2685 brw_inst_set_mlen(devinfo, insn, message_len);
2686 brw_inst_set_rlen(devinfo, insn, response_len);
2687 brw_inst_set_header_present(devinfo, insn, header_present);
2688
2689 return insn;
2690 }
2691
2692 static bool
2693 while_jumps_before_offset(const struct gen_device_info *devinfo,
2694 brw_inst *insn, int while_offset, int start_offset)
2695 {
2696 int scale = 16 / brw_jump_scale(devinfo);
2697 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2698 : brw_inst_jip(devinfo, insn);
2699 return while_offset + jip * scale <= start_offset;
2700 }
2701
2702
2703 static int
2704 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2705 {
2706 int offset;
2707 void *store = p->store;
2708 const struct gen_device_info *devinfo = p->devinfo;
2709
2710 int depth = 0;
2711
2712 for (offset = next_offset(devinfo, store, start_offset);
2713 offset < p->next_insn_offset;
2714 offset = next_offset(devinfo, store, offset)) {
2715 brw_inst *insn = store + offset;
2716
2717 switch (brw_inst_opcode(devinfo, insn)) {
2718 case BRW_OPCODE_IF:
2719 depth++;
2720 break;
2721 case BRW_OPCODE_ENDIF:
2722 if (depth == 0)
2723 return offset;
2724 depth--;
2725 break;
2726 case BRW_OPCODE_WHILE:
2727 /* If the while doesn't jump before our instruction, it's the end
2728 * of a sibling do...while loop. Ignore it.
2729 */
2730 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2731 continue;
2732 /* fallthrough */
2733 case BRW_OPCODE_ELSE:
2734 case BRW_OPCODE_HALT:
2735 if (depth == 0)
2736 return offset;
2737 }
2738 }
2739
2740 return 0;
2741 }
2742
2743 /* There is no DO instruction on gen6, so to find the end of the loop
2744 * we have to see if the loop is jumping back before our start
2745 * instruction.
2746 */
2747 static int
2748 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2749 {
2750 const struct gen_device_info *devinfo = p->devinfo;
2751 int offset;
2752 void *store = p->store;
2753
2754 assert(devinfo->gen >= 6);
2755
2756 /* Always start after the instruction (such as a WHILE) we're trying to fix
2757 * up.
2758 */
2759 for (offset = next_offset(devinfo, store, start_offset);
2760 offset < p->next_insn_offset;
2761 offset = next_offset(devinfo, store, offset)) {
2762 brw_inst *insn = store + offset;
2763
2764 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2765 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2766 return offset;
2767 }
2768 }
2769 assert(!"not reached");
2770 return start_offset;
2771 }
2772
2773 /* After program generation, go back and update the UIP and JIP of
2774 * BREAK, CONT, and HALT instructions to their correct locations.
2775 */
2776 void
2777 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2778 {
2779 const struct gen_device_info *devinfo = p->devinfo;
2780 int offset;
2781 int br = brw_jump_scale(devinfo);
2782 int scale = 16 / br;
2783 void *store = p->store;
2784
2785 if (devinfo->gen < 6)
2786 return;
2787
2788 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2789 brw_inst *insn = store + offset;
2790 assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2791
2792 int block_end_offset = brw_find_next_block_end(p, offset);
2793 switch (brw_inst_opcode(devinfo, insn)) {
2794 case BRW_OPCODE_BREAK:
2795 assert(block_end_offset != 0);
2796 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2797 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2798 brw_inst_set_uip(devinfo, insn,
2799 (brw_find_loop_end(p, offset) - offset +
2800 (devinfo->gen == 6 ? 16 : 0)) / scale);
2801 break;
2802 case BRW_OPCODE_CONTINUE:
2803 assert(block_end_offset != 0);
2804 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2805 brw_inst_set_uip(devinfo, insn,
2806 (brw_find_loop_end(p, offset) - offset) / scale);
2807
2808 assert(brw_inst_uip(devinfo, insn) != 0);
2809 assert(brw_inst_jip(devinfo, insn) != 0);
2810 break;
2811
2812 case BRW_OPCODE_ENDIF: {
2813 int32_t jump = (block_end_offset == 0) ?
2814 1 * br : (block_end_offset - offset) / scale;
2815 if (devinfo->gen >= 7)
2816 brw_inst_set_jip(devinfo, insn, jump);
2817 else
2818 brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2819 break;
2820 }
2821
2822 case BRW_OPCODE_HALT:
2823 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2824 *
2825 * "In case of the halt instruction not inside any conditional
2826 * code block, the value of <JIP> and <UIP> should be the
2827 * same. In case of the halt instruction inside conditional code
2828 * block, the <UIP> should be the end of the program, and the
2829 * <JIP> should be end of the most inner conditional code block."
2830 *
2831 * The uip will have already been set by whoever set up the
2832 * instruction.
2833 */
2834 if (block_end_offset == 0) {
2835 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2836 } else {
2837 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2838 }
2839 assert(brw_inst_uip(devinfo, insn) != 0);
2840 assert(brw_inst_jip(devinfo, insn) != 0);
2841 break;
2842 }
2843 }
2844 }
2845
2846 void brw_ff_sync(struct brw_codegen *p,
2847 struct brw_reg dest,
2848 unsigned msg_reg_nr,
2849 struct brw_reg src0,
2850 bool allocate,
2851 unsigned response_length,
2852 bool eot)
2853 {
2854 const struct gen_device_info *devinfo = p->devinfo;
2855 brw_inst *insn;
2856
2857 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2858
2859 insn = next_insn(p, BRW_OPCODE_SEND);
2860 brw_set_dest(p, insn, dest);
2861 brw_set_src0(p, insn, src0);
2862 brw_set_src1(p, insn, brw_imm_d(0));
2863
2864 if (devinfo->gen < 6)
2865 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2866
2867 brw_set_ff_sync_message(p,
2868 insn,
2869 allocate,
2870 response_length,
2871 eot);
2872 }
2873
2874 /**
2875 * Emit the SEND instruction necessary to generate stream output data on Gen6
2876 * (for transform feedback).
2877 *
2878 * If send_commit_msg is true, this is the last piece of stream output data
2879 * from this thread, so send the data as a committed write. According to the
2880 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2881 *
2882 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2883 * writes are complete by sending the final write as a committed write."
2884 */
2885 void
2886 brw_svb_write(struct brw_codegen *p,
2887 struct brw_reg dest,
2888 unsigned msg_reg_nr,
2889 struct brw_reg src0,
2890 unsigned binding_table_index,
2891 bool send_commit_msg)
2892 {
2893 brw_inst *insn;
2894
2895 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2896
2897 insn = next_insn(p, BRW_OPCODE_SEND);
2898 brw_set_dest(p, insn, dest);
2899 brw_set_src0(p, insn, src0);
2900 brw_set_src1(p, insn, brw_imm_d(0));
2901 brw_set_dp_write_message(p, insn,
2902 binding_table_index,
2903 0, /* msg_control: ignored */
2904 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2905 1, /* msg_length */
2906 true, /* header_present */
2907 0, /* last_render_target: ignored */
2908 send_commit_msg, /* response_length */
2909 0, /* end_of_thread */
2910 send_commit_msg); /* send_commit_msg */
2911 }
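
/* Usage sketch: per the PRM quote above, the last SVB write of a thread
 * passes send_commit_msg = true so the write-commit reply lands in dest
 * before the thread ends; earlier writes pass false.  The message
 * register and binding table slot are hypothetical.
 */
static void
example_final_svb_write(struct brw_codegen *p, struct brw_reg commit_dest)
{
   brw_svb_write(p, commit_dest, 1 /* m1 */, brw_vec8_grf(0, 0),
                 0 /* binding table slot */, true /* send_commit_msg */);
}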
2912
2913 static unsigned
2914 brw_surface_payload_size(struct brw_codegen *p,
2915 unsigned num_channels,
2916 bool has_simd4x2,
2917 bool has_simd16)
2918 {
2919 if (has_simd4x2 &&
2920 brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2921 return 1;
2922 else if (has_simd16 &&
2923 brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16)
2924 return 2 * num_channels;
2925 else
2926 return num_channels;
2927 }
2928
2929 static void
2930 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2931 brw_inst *insn,
2932 unsigned atomic_op,
2933 bool response_expected)
2934 {
2935 const struct gen_device_info *devinfo = p->devinfo;
2936 unsigned msg_control =
2937 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2938 (response_expected ? 1 << 5 : 0); /* Return data expected */
2939
2940 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2941 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2942 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2943 msg_control |= 1 << 4; /* SIMD8 mode */
2944
2945 brw_inst_set_dp_msg_type(devinfo, insn,
2946 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2947 } else {
2948 brw_inst_set_dp_msg_type(devinfo, insn,
2949 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2950 }
2951 } else {
2952 brw_inst_set_dp_msg_type(devinfo, insn,
2953 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2954
2955 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2956 msg_control |= 1 << 4; /* SIMD8 mode */
2957 }
2958
2959 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2960 }
2961
2962 void
2963 brw_untyped_atomic(struct brw_codegen *p,
2964 struct brw_reg dst,
2965 struct brw_reg payload,
2966 struct brw_reg surface,
2967 unsigned atomic_op,
2968 unsigned msg_length,
2969 bool response_expected)
2970 {
2971 const struct gen_device_info *devinfo = p->devinfo;
2972 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2973 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2974 GEN7_SFID_DATAPORT_DATA_CACHE);
2975 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
2976 /* Mask out unused components -- This is especially important in Align16
2977 * mode on generations that don't have native support for SIMD4x2 atomics,
2978 * because unused but enabled components will cause the dataport to perform
2979 * additional atomic operations on the addresses that happen to be in the
2980 * uninitialized Y, Z and W coordinates of the payload.
2981 */
2982 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2983 struct brw_inst *insn = brw_send_indirect_surface_message(
2984 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
2985 brw_surface_payload_size(p, response_expected,
2986 devinfo->gen >= 8 || devinfo->is_haswell, true),
2987 align1);
2988
2989 brw_set_dp_untyped_atomic_message(
2990 p, insn, atomic_op, response_expected);
2991 }
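
/* Usage sketch: an untyped atomic add that returns the old value.
 * BRW_AOP_ADD is assumed from brw_defines.h; the payload register is
 * hypothetical and must already hold the address and source operand in
 * the layout the message expects, and the surface index is an immediate
 * binding table entry.
 */
static void
example_atomic_add(struct brw_codegen *p, struct brw_reg dst,
                   struct brw_reg payload)
{
   brw_untyped_atomic(p, dst, payload, brw_imm_ud(1 /* BTI */),
                      BRW_AOP_ADD, 2 /* msg_length */,
                      true /* response_expected */);
}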
2992
2993 static void
2994 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2995 struct brw_inst *insn,
2996 unsigned num_channels)
2997 {
2998 const struct gen_device_info *devinfo = p->devinfo;
2999 /* Set mask of 32-bit channels to drop. */
3000 unsigned msg_control = 0xf & (0xf << num_channels);
3001
3002 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3003 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3004 msg_control |= 1 << 4; /* SIMD16 mode */
3005 else
3006 msg_control |= 2 << 4; /* SIMD8 mode */
3007 }
3008
3009 brw_inst_set_dp_msg_type(devinfo, insn,
3010 (devinfo->gen >= 8 || devinfo->is_haswell ?
3011 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
3012 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
3013 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3014 }
3015
3016 void
3017 brw_untyped_surface_read(struct brw_codegen *p,
3018 struct brw_reg dst,
3019 struct brw_reg payload,
3020 struct brw_reg surface,
3021 unsigned msg_length,
3022 unsigned num_channels)
3023 {
3024 const struct gen_device_info *devinfo = p->devinfo;
3025 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3026 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3027 GEN7_SFID_DATAPORT_DATA_CACHE);
3028 struct brw_inst *insn = brw_send_indirect_surface_message(
3029 p, sfid, dst, payload, surface, msg_length,
3030 brw_surface_payload_size(p, num_channels, true, true),
3031 false);
3032
3033 brw_set_dp_untyped_surface_read_message(
3034 p, insn, num_channels);
3035 }
3036
3037 static void
3038 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
3039 struct brw_inst *insn,
3040 unsigned num_channels)
3041 {
3042 const struct gen_device_info *devinfo = p->devinfo;
3043 /* Set mask of 32-bit channels to drop. */
3044 unsigned msg_control = 0xf & (0xf << num_channels);
3045
3046 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3047 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3048 msg_control |= 1 << 4; /* SIMD16 mode */
3049 else
3050 msg_control |= 2 << 4; /* SIMD8 mode */
3051 } else {
3052 if (devinfo->gen >= 8 || devinfo->is_haswell)
3053 msg_control |= 0 << 4; /* SIMD4x2 mode */
3054 else
3055 msg_control |= 2 << 4; /* SIMD8 mode */
3056 }
3057
3058 brw_inst_set_dp_msg_type(devinfo, insn,
3059 devinfo->gen >= 8 || devinfo->is_haswell ?
3060 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
3061 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
3062 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3063 }
3064
3065 void
3066 brw_untyped_surface_write(struct brw_codegen *p,
3067 struct brw_reg payload,
3068 struct brw_reg surface,
3069 unsigned msg_length,
3070 unsigned num_channels)
3071 {
3072 const struct gen_device_info *devinfo = p->devinfo;
3073 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3074 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3075 GEN7_SFID_DATAPORT_DATA_CACHE);
3076 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3077 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3078 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3079 WRITEMASK_X : WRITEMASK_XYZW;
3080 struct brw_inst *insn = brw_send_indirect_surface_message(
3081 p, sfid, brw_writemask(brw_null_reg(), mask),
3082 payload, surface, msg_length, 0, align1);
3083
3084 brw_set_dp_untyped_surface_write_message(
3085 p, insn, num_channels);
3086 }
3087
3088 static void
3089 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3090 struct brw_inst *insn,
3091 unsigned atomic_op,
3092 bool response_expected)
3093 {
3094 const struct gen_device_info *devinfo = p->devinfo;
3095 unsigned msg_control =
3096 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3097 (response_expected ? 1 << 5 : 0); /* Return data expected */
3098
3099 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3100 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3101 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3102 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3103
3104 brw_inst_set_dp_msg_type(devinfo, insn,
3105 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3106 } else {
3107 brw_inst_set_dp_msg_type(devinfo, insn,
3108 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3109 }
3110
3111 } else {
3112 brw_inst_set_dp_msg_type(devinfo, insn,
3113 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3114
3115 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3116 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3117 }
3118
3119 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3120 }
3121
3122 void
3123 brw_typed_atomic(struct brw_codegen *p,
3124 struct brw_reg dst,
3125 struct brw_reg payload,
3126 struct brw_reg surface,
3127 unsigned atomic_op,
3128 unsigned msg_length,
3129 bool response_expected) {
3130 const struct gen_device_info *devinfo = p->devinfo;
3131 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3132 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3133 GEN6_SFID_DATAPORT_RENDER_CACHE);
3134 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3135 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3136 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3137 struct brw_inst *insn = brw_send_indirect_surface_message(
3138 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3139 brw_surface_payload_size(p, response_expected,
3140 devinfo->gen >= 8 || devinfo->is_haswell, false),
3141 true);
3142
3143 brw_set_dp_typed_atomic_message(
3144 p, insn, atomic_op, response_expected);
3145 }
3146
3147 static void
3148 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3149 struct brw_inst *insn,
3150 unsigned num_channels)
3151 {
3152 const struct gen_device_info *devinfo = p->devinfo;
3153 /* Set mask of unused channels. */
3154 unsigned msg_control = 0xf & (0xf << num_channels);
3155
3156 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3157 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3158 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3159 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3160 else
3161 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3162 }
3163
3164 brw_inst_set_dp_msg_type(devinfo, insn,
3165 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3166 } else {
3167 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3168 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3169 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3170 }
3171
3172 brw_inst_set_dp_msg_type(devinfo, insn,
3173 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3174 }
3175
3176 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3177 }
3178
3179 void
3180 brw_typed_surface_read(struct brw_codegen *p,
3181 struct brw_reg dst,
3182 struct brw_reg payload,
3183 struct brw_reg surface,
3184 unsigned msg_length,
3185 unsigned num_channels)
3186 {
3187 const struct gen_device_info *devinfo = p->devinfo;
3188 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3189 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3190 GEN6_SFID_DATAPORT_RENDER_CACHE);
3191 struct brw_inst *insn = brw_send_indirect_surface_message(
3192 p, sfid, dst, payload, surface, msg_length,
3193 brw_surface_payload_size(p, num_channels,
3194 devinfo->gen >= 8 || devinfo->is_haswell, false),
3195 true);
3196
3197 brw_set_dp_typed_surface_read_message(
3198 p, insn, num_channels);
3199 }
3200
3201 static void
3202 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3203 struct brw_inst *insn,
3204 unsigned num_channels)
3205 {
3206 const struct gen_device_info *devinfo = p->devinfo;
3207 /* Set mask of unused channels. */
3208 unsigned msg_control = 0xf & (0xf << num_channels);
3209
3210 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3211 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3212 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3213 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3214 else
3215 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3216 }
3217
3218 brw_inst_set_dp_msg_type(devinfo, insn,
3219 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3220
3221 } else {
3222 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3223 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3224 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3225 }
3226
3227 brw_inst_set_dp_msg_type(devinfo, insn,
3228 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3229 }
3230
3231 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3232 }
3233
3234 void
3235 brw_typed_surface_write(struct brw_codegen *p,
3236 struct brw_reg payload,
3237 struct brw_reg surface,
3238 unsigned msg_length,
3239 unsigned num_channels)
3240 {
3241 const struct gen_device_info *devinfo = p->devinfo;
3242 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3243 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3244 GEN6_SFID_DATAPORT_RENDER_CACHE);
3245 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3246 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3247 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3248 WRITEMASK_X : WRITEMASK_XYZW);
3249 struct brw_inst *insn = brw_send_indirect_surface_message(
3250 p, sfid, brw_writemask(brw_null_reg(), mask),
3251 payload, surface, msg_length, 0, true);
3252
3253 brw_set_dp_typed_surface_write_message(
3254 p, insn, num_channels);
3255 }
3256
3257 static void
3258 brw_set_memory_fence_message(struct brw_codegen *p,
3259 struct brw_inst *insn,
3260 enum brw_message_target sfid,
3261 bool commit_enable)
3262 {
3263 const struct gen_device_info *devinfo = p->devinfo;
3264
3265 brw_set_message_descriptor(p, insn, sfid,
3266 1 /* message length */,
3267 (commit_enable ? 1 : 0) /* response length */,
3268 true /* header present */,
3269 false);
3270
3271 switch (sfid) {
3272 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3273 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3274 break;
3275 case GEN7_SFID_DATAPORT_DATA_CACHE:
3276 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3277 break;
3278 default:
3279 unreachable("Not reached");
3280 }
3281
3282 if (commit_enable)
3283 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3284 }

void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
   struct brw_inst *insn;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = vec1(dst);

   /* Set dst as destination for dependency tracking; the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   dst = retype(dst, BRW_REGISTER_TYPE_UW);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need
       * to flush it too.  Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of
       * the first to trigger a pipeline stall -- this way future render and
       * data cache messages will be properly ordered with respect to past
       * data and render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   brw_pop_insn_state(p);
}
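
/* On IVB the sequence emitted above is roughly (illustrative assembly only;
 * register numbers are examples, not what any particular caller gets):
 *
 *    send (1) gN<1>UW   gN    ... data cache fence, commit enable
 *    send (1) gN+1<1>UW gN+1  ... render cache fence, commit enable
 *    mov  (1) gN<1>UW   gN+1<0,1,0>UW
 *
 * The final MOV cannot execute until both commit writebacks have landed,
 * which is what produces the stall.
 */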

void
brw_pixel_interpolator_query(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             bool noperspective,
                             unsigned mode,
                             struct brw_reg data,
                             unsigned msg_length,
                             unsigned response_length)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;
   const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);

   /* brw_send_indirect_message will automatically use a direct send message
    * if data is actually immediate.
    */
   insn = brw_send_indirect_message(p,
                                    GEN7_SFID_PIXEL_INTERPOLATOR,
                                    dest,
                                    mrf,
                                    vec1(data));
   brw_inst_set_mlen(devinfo, insn, msg_length);
   brw_inst_set_rlen(devinfo, insn, response_length);

   brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
   brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
   brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
   brw_inst_set_pi_message_type(devinfo, insn, mode);
}
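
/* The mode argument selects the pixel interpolator message type; the
 * GEN7_PIXEL_INTERPOLATOR_LOC_* defines in brw_defines.h (SHARED_OFFSET,
 * SAMPLE, CENTROID, PER_SLOT_OFFSET) are the values callers are expected to
 * pass here.
 */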

void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
   const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask.  The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n).  Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }
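
         /* E.g. (illustrative): for a SIMD8 instruction in the second
          * quarter (qtr_control == 1) the dispatch mask is shifted right by
          * 8 so that the bits describing channels 8..15 line up with the low
          * bits of ce0 before the AND.
          */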

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as a result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_reg(1, 0);

         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0.  We could use a single 32-wide move here if it
          * weren't for the hardware bug that causes channel enables to be
          * applied incorrectly to the second half of 32-wide instructions on
          * Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_flag_reg_nr(devinfo, inst, 1);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         }
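
         /* For example (illustrative): with exec_size == 16 the loop runs
          * once and emits a single 16-wide MOV of zero whose 'z' conditional
          * modifier sets the flag bit of every executing channel (0 == 0 is
          * true), leaving the execution mask in f1.0.
          */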

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.  Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}

void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job,
       * but asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
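
      /* In align1 mode the <0;1,0> region built here replicates the single
       * element at subregister offset i across all channels of the MOV; in
       * align16 mode the <0;4,1> region replicates the whole i-th
       * four-component vector instead.
       */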
   } else {
      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         const unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;
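
         /* (The 512-byte figure is consistent with a signed address
          * immediate covering [-512, 511]; the exact field width is an
          * assumption here -- see the hardware docs.)
          */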

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate; account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit)
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         brw_MOV(p, dst,
                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
                        src.type));
      } else {
         /* In SIMD4x2 mode the index can be either zero or one; replicate
          * it to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword
 * for all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just
 * write one u32.  So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(p->devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_src1(p, send, brw_imm_ud(0));
   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);

   brw_pop_insn_state(p);
}


/**
 * Emit the SEND message for a barrier
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());

   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
                              1 /* msg_length */,
                              0 /* response_length */,
                              false /* header_present */,
                              false /* end_of_thread */);

   brw_inst_set_gateway_notify(devinfo, inst, 1);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
}
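
/* Sketch of the typical pairing (illustrative only; src is assumed to be a
 * message header register carrying the barrier ID):
 *
 *    brw_barrier(p, barrier_payload);   signal arrival at the barrier
 *    brw_WAIT(p);                       block on n0 until it is released
 */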


/**
 * Emit the wait instruction for a barrier
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}