intel: Rename brw_get_device_name/info to gen_get_device_name/info
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "util/ralloc.h"
38
39 /**
40 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
41 * registers, implicitly moving the operand to a message register.
42 *
43 * On Sandybridge, this is no longer the case. This function performs the
44 * explicit move; it should be called before emitting a SEND instruction.
45 */
46 void
47 gen6_resolve_implied_move(struct brw_codegen *p,
48 struct brw_reg *src,
49 unsigned msg_reg_nr)
50 {
51 const struct gen_device_info *devinfo = p->devinfo;
52 if (devinfo->gen < 6)
53 return;
54
55 if (src->file == BRW_MESSAGE_REGISTER_FILE)
56 return;
57
58 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
59 brw_push_insn_state(p);
60 brw_set_default_exec_size(p, BRW_EXECUTE_8);
61 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64 retype(*src, BRW_REGISTER_TYPE_UD));
65 brw_pop_insn_state(p);
66 }
67 *src = brw_message_reg(msg_reg_nr);
68 }
69
70 static void
71 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72 {
73 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74 * "The send with EOT should use register space R112-R127 for <src>. This is
75 * to enable loading of a new thread into the same slot while the message
76 * with EOT for current thread is pending dispatch."
77 *
78 * Since we're pretending to have 16 MRFs anyway, we may as well use the
79 * registers required for messages with EOT.
80 */
81 const struct gen_device_info *devinfo = p->devinfo;
82 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83 reg->file = BRW_GENERAL_REGISTER_FILE;
84 reg->nr += GEN7_MRF_HACK_START;
85 }
86 }
87
88 /**
89 * Convert a brw_reg_type enumeration value into the hardware representation.
90 *
91 * The hardware encoding may depend on whether the value is an immediate.
92 */
unsigned
brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
                        enum brw_reg_type type, enum brw_reg_file file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      /* Immediates use a distinct encoding: the packed-vector types
       * (UV/VF/V) exist only as immediates, while byte types (UB/B) cannot
       * be immediates at all (-1 marks invalid entries, caught below).
       */
      static const int imm_hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UB] = -1,
         [BRW_REGISTER_TYPE_B]  = -1,
         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(imm_hw_types));
      assert(imm_hw_types[type] != -1);
      /* DF and later enum values only have immediate encodings on Gen8+. */
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
      return imm_hw_types[type];
   } else {
      /* Non-immediate registers */
      static const int hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UV] = -1,
         [BRW_REGISTER_TYPE_VF] = -1,
         [BRW_REGISTER_TYPE_V]  = -1,
         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(hw_types));
      assert(hw_types[type] != -1);
      /* DF registers require Gen7+; HF and wider require Gen8+. */
      assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
      return hw_types[type];
   }
}
143
/**
 * Encode \p dest as the destination operand of \p inst.
 *
 * Handles direct and indirect addressing and both Align1 (stride) and
 * Align16 (writemask) access modes, and may shrink the instruction's
 * exec size to match a small destination (see comment near the bottom).
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Range-check the register number for files with a bounded namespace. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
   brw_inst_set_dst_reg_type(devinfo, inst,
                             brw_reg_type_to_hw_type(devinfo, dest.type,
                                                     dest.file));
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         /* Scalar (stride-0) destinations are encoded as stride 1. */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         /* Align16 subreg is in 16-byte units and uses a writemask. */
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
         /* even ignored in da16, still need to set as '01' */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct. However, when dealing with
    * small registers, we automatically reduce it to match the register size.
    *
    * In platforms that support fp64 we can emit instructions with a width of
    * 4 that need two SIMD8 registers and an exec_size of 8 or 16. In these
    * cases we need to make sure that these instructions have their exec sizes
    * set properly when they are emitted and we can't rely on this code to fix
    * it.
    */
   bool fix_exec_size;
   if (devinfo->gen >= 6)
      fix_exec_size = dest.width < BRW_EXECUTE_4;
   else
      fix_exec_size = dest.width < BRW_EXECUTE_8;

   if (fix_exec_size)
      brw_inst_set_exec_size(devinfo, inst, dest.width);
}
221
222 extern int reg_type_size[];
223
224 static void
225 validate_reg(const struct gen_device_info *devinfo,
226 brw_inst *inst, struct brw_reg reg)
227 {
228 const int hstride_for_reg[] = {0, 1, 2, 4};
229 const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
230 const int width_for_reg[] = {1, 2, 4, 8, 16};
231 const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
232 int width, hstride, vstride, execsize;
233
234 if (reg.file == BRW_IMMEDIATE_VALUE) {
235 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
236 * mean the destination has to be 128-bit aligned and the
237 * destination horiz stride has to be a word.
238 */
239 if (reg.type == BRW_REGISTER_TYPE_V) {
240 assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
241 reg_type_size[brw_inst_dst_reg_type(devinfo, inst)] == 2);
242 }
243
244 return;
245 }
246
247 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
248 reg.file == BRW_ARF_NULL)
249 return;
250
251 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
252 *
253 * "Swizzling is not allowed when an accumulator is used as an implicit
254 * source or an explicit source in an instruction."
255 */
256 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
257 reg.nr == BRW_ARF_ACCUMULATOR)
258 assert(reg.swizzle == BRW_SWIZZLE_XYZW);
259
260 assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
261 hstride = hstride_for_reg[reg.hstride];
262
263 if (reg.vstride == 0xf) {
264 vstride = -1;
265 } else {
266 assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
267 vstride = vstride_for_reg[reg.vstride];
268 }
269
270 assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
271 width = width_for_reg[reg.width];
272
273 assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
274 brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
275 execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];
276
277 /* Restrictions from 3.3.10: Register Region Restrictions. */
278 /* 3. */
279 assert(execsize >= width);
280
281 /* 4. */
282 if (execsize == width && hstride != 0) {
283 assert(vstride == -1 || vstride == width * hstride);
284 }
285
286 /* 5. */
287 if (execsize == width && hstride == 0) {
288 /* no restriction on vstride. */
289 }
290
291 /* 6. */
292 if (width == 1) {
293 assert(hstride == 0);
294 }
295
296 /* 7. */
297 if (execsize == 1 && width == 1) {
298 assert(hstride == 0);
299 assert(vstride == 0);
300 }
301
302 /* 8. */
303 if (vstride == 0 && hstride == 0) {
304 assert(width == 1);
305 }
306
307 /* 10. Check destination issues. */
308 }
309
/**
 * Return true if \p imm can be stored in a compacted instruction.
 *
 * A compacted instruction keeps only the low 12 bits of an immediate; the
 * upper 20 bits are reconstructed by replicating bit 11.  The value is
 * representable exactly when those upper 20 bits are all zeros or all ones.
 */
static bool
is_compactable_immediate(unsigned imm)
{
   const unsigned high_bits = imm & ~0xfffu;
   return high_bits == 0u || high_bits == 0xfffff000u;
}
319
/**
 * Encode \p reg as the first source operand of \p inst.
 *
 * Handles immediates (including the compaction-friendly type rewrites
 * below), direct/indirect addressing, and Align1 region vs Align16
 * swizzle encodings.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Range-check the register number for files with a bounded namespace. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src0_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* DF immediates (and DIM, which also carries its immediate in the
       * 64-bit field -- see the DIM-specific handling elsewhere) use the
       * wide immediate slot; everything else fits in 32 bits.
       */
      if (reg.type == BRW_REGISTER_TYPE_DF ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
         brw_inst_set_imm_df(devinfo, inst, reg.df);
      else
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);

      /* The Bspec's section titled "Non-present Operands" claims that if src0
       * is an immediate that src1's type must be the same as that of src0.
       *
       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
       * that do not follow this rule. E.g., from the IVB/HSW table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
       *
       * And from the SNB table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
       *
       * Neither of these cause warnings from the simulator when used,
       * compacted or otherwise. In fact, all compaction mappings that have an
       * immediate in src0 use a:ud for src1.
       *
       * The GM45 instruction compaction tables do not contain mapped meanings
       * so it's not clear whether it has the restriction. We'll assume it was
       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
       *
       * Don't do any of this for 64-bit immediates, since the src1 fields
       * overlap with the immediate and setting them would overwrite the
       * immediate we set.
       */
      if (type_sz(reg.type) < 8) {
         brw_inst_set_src1_reg_file(devinfo, inst,
                                    BRW_ARCHITECTURE_REGISTER_FILE);
         if (devinfo->gen < 6) {
            brw_inst_set_src1_reg_type(devinfo, inst,
                                       brw_inst_src0_reg_type(devinfo, inst));
         } else {
            brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         }
      }

      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
       * for immediate values. Presumably the hardware engineers realized
       * that the only useful floating-point value that could be represented
       * in this format is 0.0, which can also be represented as a VF-typed
       * immediate, so they gave us the previously mentioned mapping on IVB+.
       *
       * Strangely, we do have a mapping for imm:f in src1, so we don't need
       * to do this there.
       *
       * If we see a 0.0:F, change the type to VF so that it can be compacted.
       */
      if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
      }

      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
       * set the types to :UD so the instruction can be compacted.
       */
      if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
          brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
          brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            /* Align16 subreg is in 16-byte units. */
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
         } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A width-1 source in a SIMD1 instruction is always encoded as the
          * canonical <0;1,0> scalar region.
          */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
      }
   }
}
471
472
/**
 * Encode \p reg as the second source operand of \p inst.
 *
 * src1 is more restricted than src0: it cannot name the accumulator
 * explicitly, cannot be an MRF, and only supports direct addressing
 * (all asserted below).
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *    operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src1_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* two-argument instructions can only use 32-bit immediates */
      assert(type_sz(reg.type) < 8);
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         /* Align16 subreg is in 16-byte units. */
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A width-1 source in a SIMD1 instruction is always encoded as the
          * canonical <0;1,0> scalar region.
          */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
                                       BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
      }
   }
}
553
554 /**
555 * Set the Message Descriptor and Extended Message Descriptor fields
556 * for SEND messages.
557 *
558 * \note This zeroes out the Function Control bits, so it must be called
559 * \b before filling out any message-specific data. Callers can
560 * choose not to fill in irrelevant bits; they will be zero.
561 */
void
brw_set_message_descriptor(struct brw_codegen *p,
                           brw_inst *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Zero out src1 (which holds the 32-bit descriptor) before OR-ing in
    * individual fields -- this is the "zeroes the Function Control bits"
    * behavior documented above.
    */
   brw_set_src1(p, inst, brw_imm_d(0));

   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
    * itself; instead, it will be a MOV/OR into the address register.
    *
    * In this case, we avoid setting the extended message descriptor bits,
    * since they go on the later SEND/SENDC instead and if set here would
    * instead clobber the conditionalmod bits.
    */
   unsigned opcode = brw_inst_opcode(devinfo, inst);
   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
      brw_inst_set_sfid(devinfo, inst, sfid);
   }

   brw_inst_set_mlen(devinfo, inst, msg_length);
   brw_inst_set_rlen(devinfo, inst, response_length);
   brw_inst_set_eot(devinfo, inst, end_of_thread);

   /* The header-present bit is only written on Gen5+. */
   if (devinfo->gen >= 5) {
      brw_inst_set_header_present(devinfo, inst, header_present);
   }
}
595
/**
 * Configure \p inst as an extended-math (BRW_SFID_MATH) SEND message.
 *
 * Message and response lengths are inferred from \p function: POW and the
 * integer-division variants consume two operand registers, and SINCOS /
 * INT_DIV_QUOTIENT_AND_REMAINDER produce two result registers.
 *
 * Saturation is encoded inside the math message descriptor, so the
 * instruction-level saturate bit is copied into the message and then
 * cleared on the instruction.
 */
static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }


   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   /* Saturation now lives in the message descriptor; clear it on the
    * instruction itself.
    */
   brw_inst_set_saturate(devinfo, inst, 0);
}
641
642
/**
 * Configure \p insn as a URB FF_SYNC message.
 *
 * The message length is always 1 (just the header) and a header is always
 * present; fields the FF_SYNC opcode does not use are explicitly zeroed.
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
661
/**
 * Configure \p insn as a URB write message.
 *
 * \param flags            brw_urb_write_flags combination; some flags are
 *                         generation-specific (asserted below)
 * \param offset           global offset into the URB entry
 * \param swizzle_control  URB swizzle mode (TRANSPOSE is not valid on Gen7+)
 */
static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   /* The "complete" bit disappeared on Gen8. */
   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   /* Gen7 replaced the allocate/used controls with a per-slot-offset bit. */
   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
702
703 void
704 brw_set_dp_write_message(struct brw_codegen *p,
705 brw_inst *insn,
706 unsigned binding_table_index,
707 unsigned msg_control,
708 unsigned msg_type,
709 unsigned msg_length,
710 bool header_present,
711 unsigned last_render_target,
712 unsigned response_length,
713 unsigned end_of_thread,
714 unsigned send_commit_msg)
715 {
716 const struct gen_device_info *devinfo = p->devinfo;
717 unsigned sfid;
718
719 if (devinfo->gen >= 7) {
720 /* Use the Render Cache for RT writes; otherwise use the Data Cache */
721 if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
722 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
723 else
724 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
725 } else if (devinfo->gen == 6) {
726 /* Use the render cache for all write messages. */
727 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
728 } else {
729 sfid = BRW_SFID_DATAPORT_WRITE;
730 }
731
732 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
733 header_present, end_of_thread);
734
735 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
736 brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
737 brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
738 brw_inst_set_rt_last(devinfo, insn, last_render_target);
739 if (devinfo->gen < 7) {
740 brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
741 }
742 }
743
/**
 * Configure \p insn as a data port read message.
 *
 * \p target_cache selects which cache services the read.  On Gen6+ the
 * choice is encoded via the SFID; before Gen6 the target cache is a field
 * in the descriptor itself.
 */
void
brw_set_dp_read_message(struct brw_codegen *p,
                        brw_inst *insn,
                        unsigned binding_table_index,
                        unsigned msg_control,
                        unsigned msg_type,
                        unsigned target_cache,
                        unsigned msg_length,
                        bool header_present,
                        unsigned response_length)
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned sfid;

   if (devinfo->gen >= 7) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else if (target_cache == BRW_DATAPORT_READ_TARGET_DATA_CACHE)
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
      else if (target_cache == BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE)
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
      else
         unreachable("Invalid target cache");

   } else if (devinfo->gen == 6) {
      /* Gen6 has no separate data cache SFID. */
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
   /* Pre-Gen6, the target cache lives in the message descriptor. */
   if (devinfo->gen < 6)
      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
}
786
/**
 * Configure \p inst as a sampler (texture) message.
 *
 * \p simd_mode is only encoded on Gen5+; \p return_format is only encoded
 * on original Gen4 (not G4x) -- on other generations those parameters are
 * ignored here.
 */
void
brw_set_sampler_message(struct brw_codegen *p,
                        brw_inst *inst,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
   brw_inst_set_sampler(devinfo, inst, sampler);
   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
   if (devinfo->gen >= 5) {
      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
   }
}
813
/**
 * Configure \p inst as a Gen7+ data-cache scratch block read/write message.
 *
 * \param write     true for a scratch write, false for a read
 * \param dword     selects the scratch message "type" bit -- presumably
 *                  DWord-scattered vs OWord-block access; confirm against
 *                  the PRM scratch message definitions
 * \param num_regs  registers to transfer: 1, 2 or 4 (Gen8+ also allows 8)
 * \param addr_offset  offset into scratch space (units per PRM; not
 *                  visible here -- confirm against callers)
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   /* Gen8+ encodes the block size as log2(num_regs); Gen7 as num_regs - 1. */
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
841
/* Local shorthand used by the emit helpers below. */
#define next_insn brw_next_insn
/**
 * Append a fresh instruction slot to the program and return it.
 *
 * Doubles the instruction store when full, advances next_insn_offset by
 * one 16-byte native instruction, seeds the new slot from p->current (the
 * default instruction state), and stamps \p opcode onto it.
 */
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* Grow the store geometrically when it is full. */
   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   /* Start from the current default instruction state. */
   memcpy(insn, p->current, sizeof(*insn));

   brw_inst_set_opcode(devinfo, insn, opcode);
   return insn;
}
861
862 static brw_inst *
863 brw_alu1(struct brw_codegen *p, unsigned opcode,
864 struct brw_reg dest, struct brw_reg src)
865 {
866 brw_inst *insn = next_insn(p, opcode);
867 brw_set_dest(p, insn, dest);
868 brw_set_src0(p, insn, src);
869 return insn;
870 }
871
872 static brw_inst *
873 brw_alu2(struct brw_codegen *p, unsigned opcode,
874 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
875 {
876 /* 64-bit immediates are only supported on 1-src instructions */
877 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
878 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
879
880 brw_inst *insn = next_insn(p, opcode);
881 brw_set_dest(p, insn, dest);
882 brw_set_src0(p, insn, src0);
883 brw_set_src1(p, insn, src1);
884 return insn;
885 }
886
887 static int
888 get_3src_subreg_nr(struct brw_reg reg)
889 {
890 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
891 * use 32-bit units (components 0..7). Since they only support F/D/UD
892 * types, this doesn't lose any flexibility, but uses fewer bits.
893 */
894 return reg.subnr / 4;
895 }
896
/* Emit a three-source instruction (MAD, LRP, BFE, BFI2, ...).  The 3-src
 * encoding is Align16-only and more restrictive than the 2-src one: sources
 * must be direct GRF accesses, subregister numbers are in 32-bit components,
 * and all sources share a single type field.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_DF ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      /* Gen6 has a dedicated bit selecting a GRF vs. MRF destination. */
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   /* The destination subregister is encoded in 16-byte units. */
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   /* rep_ctrl is set when the source is a scalar (vstride 0). */
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_DF:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      default:
         unreachable("not reached");
      }
   }

   return inst;
}
988
989
990 /***********************************************************************
991 * Convenience routines.
992 */
/* Define brw_<OP>() emitting the one-source instruction BRW_OPCODE_<OP>. */
#define ALU1(OP)                                                              \
brw_inst *brw_##OP(struct brw_codegen *p,                                     \
          struct brw_reg dest,                                                \
          struct brw_reg src0)                                                \
{                                                                             \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);                           \
}

/* Define brw_<OP>() emitting the two-source instruction BRW_OPCODE_<OP>. */
#define ALU2(OP)                                                              \
brw_inst *brw_##OP(struct brw_codegen *p,                                     \
          struct brw_reg dest,                                                \
          struct brw_reg src0,                                                \
          struct brw_reg src1)                                                \
{                                                                             \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);                     \
}

/* Define brw_<OP>() emitting the three-source instruction BRW_OPCODE_<OP>. */
#define ALU3(OP)                                                              \
brw_inst *brw_##OP(struct brw_codegen *p,                                     \
          struct brw_reg dest,                                                \
          struct brw_reg src0,                                                \
          struct brw_reg src1,                                                \
          struct brw_reg src2)                                                \
{                                                                             \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);               \
}

/* Like ALU3, but additionally asserts that all four operands are float
 * (or all double-float), matching dest.type -- since brw_alu3() derives the
 * shared source type from the destination type.
 */
#define ALU3F(OP)                                                             \
brw_inst *brw_##OP(struct brw_codegen *p,                                     \
                   struct brw_reg dest,                                       \
                   struct brw_reg src0,                                       \
                   struct brw_reg src1,                                       \
                   struct brw_reg src2)                                       \
{                                                                             \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                                 \
          dest.type == BRW_REGISTER_TYPE_DF);                                 \
   if (dest.type == BRW_REGISTER_TYPE_F) {                                    \
      assert(src0.type == BRW_REGISTER_TYPE_F);                               \
      assert(src1.type == BRW_REGISTER_TYPE_F);                               \
      assert(src2.type == BRW_REGISTER_TYPE_F);                               \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                            \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                              \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                              \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                              \
   }                                                                          \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);               \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                             \
void brw_##OP(struct brw_codegen *p,                                          \
              struct brw_reg dest,                                            \
              struct brw_reg src)                                             \
{                                                                             \
   const struct gen_device_info *devinfo = p->devinfo;                        \
   brw_inst *rnd, *add;                                                       \
   rnd = next_insn(p, BRW_OPCODE_##OP);                                       \
   brw_set_dest(p, rnd, dest);                                                \
   brw_set_src0(p, rnd, src);                                                 \
                                                                              \
   if (devinfo->gen < 6) {                                                    \
      /* turn on round-increments */                                          \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);            \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                          \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);          \
   }                                                                          \
}
1066
1067
/* Instantiate the brw_<OP>() convenience emitters.  ADD, AVG, MUL, LINE and
 * PLN are defined by hand below because they need extra operand checks or
 * region fixups.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
1101
1102
1103 brw_inst *
1104 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1105 struct brw_reg src0, struct brw_reg src1)
1106 {
1107 /* 6.2.2: add */
1108 if (src0.type == BRW_REGISTER_TYPE_F ||
1109 (src0.file == BRW_IMMEDIATE_VALUE &&
1110 src0.type == BRW_REGISTER_TYPE_VF)) {
1111 assert(src1.type != BRW_REGISTER_TYPE_UD);
1112 assert(src1.type != BRW_REGISTER_TYPE_D);
1113 }
1114
1115 if (src1.type == BRW_REGISTER_TYPE_F ||
1116 (src1.file == BRW_IMMEDIATE_VALUE &&
1117 src1.type == BRW_REGISTER_TYPE_VF)) {
1118 assert(src0.type != BRW_REGISTER_TYPE_UD);
1119 assert(src0.type != BRW_REGISTER_TYPE_D);
1120 }
1121
1122 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1123 }
1124
1125 brw_inst *
1126 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1127 struct brw_reg src0, struct brw_reg src1)
1128 {
1129 assert(dest.type == src0.type);
1130 assert(src0.type == src1.type);
1131 switch (src0.type) {
1132 case BRW_REGISTER_TYPE_B:
1133 case BRW_REGISTER_TYPE_UB:
1134 case BRW_REGISTER_TYPE_W:
1135 case BRW_REGISTER_TYPE_UW:
1136 case BRW_REGISTER_TYPE_D:
1137 case BRW_REGISTER_TYPE_UD:
1138 break;
1139 default:
1140 unreachable("Bad type for brw_AVG");
1141 }
1142
1143 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1144 }
1145
1146 brw_inst *
1147 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1148 struct brw_reg src0, struct brw_reg src1)
1149 {
1150 /* 6.32.38: mul */
1151 if (src0.type == BRW_REGISTER_TYPE_D ||
1152 src0.type == BRW_REGISTER_TYPE_UD ||
1153 src1.type == BRW_REGISTER_TYPE_D ||
1154 src1.type == BRW_REGISTER_TYPE_UD) {
1155 assert(dest.type != BRW_REGISTER_TYPE_F);
1156 }
1157
1158 if (src0.type == BRW_REGISTER_TYPE_F ||
1159 (src0.file == BRW_IMMEDIATE_VALUE &&
1160 src0.type == BRW_REGISTER_TYPE_VF)) {
1161 assert(src1.type != BRW_REGISTER_TYPE_UD);
1162 assert(src1.type != BRW_REGISTER_TYPE_D);
1163 }
1164
1165 if (src1.type == BRW_REGISTER_TYPE_F ||
1166 (src1.file == BRW_IMMEDIATE_VALUE &&
1167 src1.type == BRW_REGISTER_TYPE_VF)) {
1168 assert(src0.type != BRW_REGISTER_TYPE_UD);
1169 assert(src0.type != BRW_REGISTER_TYPE_D);
1170 }
1171
1172 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1173 src0.nr != BRW_ARF_ACCUMULATOR);
1174 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1175 src1.nr != BRW_ARF_ACCUMULATOR);
1176
1177 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1178 }
1179
1180 brw_inst *
1181 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1182 struct brw_reg src0, struct brw_reg src1)
1183 {
1184 src0.vstride = BRW_VERTICAL_STRIDE_0;
1185 src0.width = BRW_WIDTH_1;
1186 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1187 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1188 }
1189
1190 brw_inst *
1191 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1192 struct brw_reg src0, struct brw_reg src1)
1193 {
1194 src0.vstride = BRW_VERTICAL_STRIDE_0;
1195 src0.width = BRW_WIDTH_1;
1196 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1197 src1.vstride = BRW_VERTICAL_STRIDE_8;
1198 src1.width = BRW_WIDTH_8;
1199 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1200 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1201 }
1202
/* Convert 32-bit floats in src to half-floats, writing the result into dst.
 * On Gen8+ this is a converting MOV to HF; Gen7 has a dedicated F32TO16
 * instruction.  A UD destination may additionally require an explicit MOV to
 * zero the high 16 bits of each dword channel.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Rewrite dst as even words with stride 2 so the conversion lands in
       * the low half of each dword; a second MOV below fills the high half.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* The two MOVs write interleaved words of the same registers, so mark
       * them with NoDDClr/NoDDChk as a dependency-control pair.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_ud(0u));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1249
1250 brw_inst *
1251 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1252 {
1253 const struct gen_device_info *devinfo = p->devinfo;
1254 bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
1255
1256 if (align16) {
1257 assert(src.type == BRW_REGISTER_TYPE_UD);
1258 } else {
1259 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1260 *
1261 * Because this instruction does not have a 16-bit floating-point
1262 * type, the source data type must be Word (W). The destination type
1263 * must be F (Float).
1264 */
1265 if (src.type == BRW_REGISTER_TYPE_UD)
1266 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1267
1268 assert(src.type == BRW_REGISTER_TYPE_W ||
1269 src.type == BRW_REGISTER_TYPE_UW ||
1270 src.type == BRW_REGISTER_TYPE_HF);
1271 }
1272
1273 if (devinfo->gen >= 8) {
1274 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1275 } else {
1276 assert(devinfo->gen == 7);
1277 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1278 }
1279 }
1280
1281
1282 void brw_NOP(struct brw_codegen *p)
1283 {
1284 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1285 brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_1);
1286 brw_set_dest(p, insn, retype(brw_vec1_grf(0,0), BRW_REGISTER_TYPE_UD));
1287 brw_set_src0(p, insn, retype(brw_vec1_grf(0,0), BRW_REGISTER_TYPE_UD));
1288 brw_set_src1(p, insn, brw_imm_ud(0x0));
1289 }
1290
1291
1292
1293
1294
1295 /***********************************************************************
1296 * Comparisons, if/else/endif
1297 */
1298
1299 brw_inst *
1300 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1301 unsigned predicate_control)
1302 {
1303 const struct gen_device_info *devinfo = p->devinfo;
1304 struct brw_reg ip = brw_ip_reg();
1305 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1306
1307 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
1308 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1309 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1310 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1311
1312 return inst;
1313 }
1314
1315 static void
1316 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1317 {
1318 p->if_stack[p->if_stack_depth] = inst - p->store;
1319
1320 p->if_stack_depth++;
1321 if (p->if_stack_array_size <= p->if_stack_depth) {
1322 p->if_stack_array_size *= 2;
1323 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1324 p->if_stack_array_size);
1325 }
1326 }
1327
1328 static brw_inst *
1329 pop_if_stack(struct brw_codegen *p)
1330 {
1331 p->if_stack_depth--;
1332 return &p->store[p->if_stack[p->if_stack_depth]];
1333 }
1334
1335 static void
1336 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1337 {
1338 if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1339 p->loop_stack_array_size *= 2;
1340 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1341 p->loop_stack_array_size);
1342 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1343 p->loop_stack_array_size);
1344 }
1345
1346 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1347 p->loop_stack_depth++;
1348 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1349 }
1350
1351 static brw_inst *
1352 get_inner_do_insn(struct brw_codegen *p)
1353 {
1354 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1355 }
1356
1357 /* EU takes the value from the flag register and pushes it onto some
1358 * sort of a stack (presumably merging with any flag value already on
1359 * the stack). Within an if block, the flags at the top of the stack
1360 * control execution on each channel of the unit, eg. on each of the
1361 * 16 pixel values in our wm programs.
1362 *
1363 * When the matching 'else' instruction is reached (presumably by
1364 * countdown of the instruction count patched in by our ELSE/ENDIF
1365 * functions), the relevant flags are inverted.
1366 *
1367 * When the matching 'endif' instruction is reached, the flags are
1368 * popped off. If the stack is now empty, normal execution resumes.
1369 */
/* Emit an IF instruction with the given execution size.  Jump targets are
 * left zero here and patched by brw_ENDIF()/patch_IF_ELSE().
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      /* Gen4-5: IF is an IP-relative jump with a jump count patched later. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: the jump count lives in a word-immediate destination. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: null operands with a word-immediate src1; JIP/UIP cleared and
       * patched later.
       */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: only a dword-immediate src0; JIP/UIP cleared and patched
       * later.
       */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control needs a thread switch (except in SPF mode). */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1413
1414 /* This function is only used for gen6-style IF instructions with an
1415 * embedded comparison (conditional modifier). It is not used on gen7.
1416 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* The jump count lives in the word-immediate destination; it is patched
    * later by patch_IF_ELSE().
    */
   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The comparison is embedded in the IF itself, so it must not also be
    * compressed or predicated.
    */
   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1440
1441 /**
1442 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1443 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* Jump distances are in bytes: 16 bytes per instruction.  The IF jump
       * lands just past the ELSE; the ELSE jump lands at the would-be ENDIF.
       */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1481
1482 /**
1483 * Patch IF and ELSE instructions with appropriate jump targets.
1484 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* Jump offsets are scaled by brw_jump_scale() units per instruction. */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1572
/* Emit an ELSE instruction; jump targets are left zero and patched later by
 * patch_IF_ELSE().  The gen-dependent operand forms mirror brw_IF().
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      /* Gen4-5: IP-relative jump, jump count patched later. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: jump count in a word-immediate destination. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control needs a thread switch (except in SPF mode). */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1610
/* Close the innermost IF (and optional ELSE) block, emitting an ENDIF where
 * required and patching the earlier instructions' jump targets.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Gen-dependent operand forms, mirroring brw_IF()/brw_ELSE(). */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1691
/* Emit a BREAK instruction.  Jump targets are filled in later (pre-gen6 by
 * brw_patch_break_cont(), gen6+ by the caller via brw_set_uip_jip()).
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      /* Pre-gen6: pop the IF-stack entries of every IF nested inside the
       * loop being broken out of.
       */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));

   return insn;
}
1719
/* Emit a CONTINUE instruction; jump targets are filled in later, like
 * brw_BREAK().
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      /* Pop the IF-stack entries of every IF nested inside the loop. */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   return insn;
}
1744
/* Emit a HALT instruction (gen6+).  UIP/JIP are left zero here and updated
 * later by the caller.
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   return insn;
}
1765
1766 /* DO/WHILE loop:
1767 *
1768 * The DO/WHILE is just an unterminated loop -- break or continue are
1769 * used for control within the loop. We have a few ways they can be
1770 * done.
1771 *
1772 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1773 * jip and no DO instruction.
1774 *
1775 * For non-uniform control flow pre-gen6, there's a DO instruction to
1776 * push the mask, and a WHILE to jump back, and BREAK to get out and
1777 * pop the mask.
1778 *
1779 * For gen6, there's no more mask stack, so no need for DO. WHILE
1780 * just points back to the first instruction of the loop.
1781 */
/* Open a DO/WHILE loop.  On gen6+ (and in SPF mode) no DO instruction is
 * emitted; the loop top is simply recorded on the loop stack.  The returned
 * pointer marks the first instruction of the loop body either way.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1808
1809 /**
1810 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1811 * instruction here.
1812 *
1813 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1814 * nesting, since it can always just point to the end of the block/current loop.
1815 */
/**
 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   /* Walk backward from the WHILE to its matching DO, patching jump counts
    * in units of the per-generation jump scale (br).
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps to just past the WHILE (one instruction further). */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONTINUE jumps to the WHILE itself. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
1840
/**
 * Emit the WHILE that closes the innermost loop, branching back to the
 * instruction recorded by brw_DO().
 *
 * Gen6+ encodes the backward distance in JIP (gen6: jump count field), with
 * per-generation operand forms.  Pre-gen6, uniform control flow lowers the
 * back-edge to a plain ADD on the IP register; divergent loops use a real
 * WHILE and also patch any BREAK/CONTINUE inside the body.  Pops one level
 * off the loop stack.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn,
                             brw_inst_exec_size(devinfo, p->current));

   } else {
      if (p->single_program_flow) {
         /* Uniform flow: just step IP back to the loop top.  Each native
          * instruction is 16 bytes, hence the * 16.
          */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Point any un-patched BREAK/CONTINUE in the body at this WHILE. */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1903
1904 /* FORWARD JUMPS:
1905 */
/**
 * Patch the forward JMPI emitted at instruction index jmp_insn_idx so it
 * lands at the current end of the instruction stream.
 *
 * The jump count is in instruction units, scaled by 2 from gen5 on.
 */
void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
   unsigned jmpi = 1;

   if (devinfo->gen >= 5)
      jmpi = 2;

   /* Only a JMPI with an immediate src1 can be patched this way. */
   assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
   assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);

   brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
}
1921
1922 /* To integrate with the above, it makes sense that the comparison
1923 * instruction should populate the flag register. It might be simpler
1924 * just to use the flag reg for most WM tasks?
1925 */
/**
 * Emit a CMP instruction: compare src0 against src1 with the given
 * conditional modifier, writing the result to dest and updating the flag
 * register.
 */
void brw_CMP(struct brw_codegen *p,
	     struct brw_reg dest,
	     unsigned conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);

   brw_inst_set_cond_modifier(devinfo, insn, conditional);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (devinfo->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
      }
   }
}
1954
1955 /***********************************************************************
1956 * Helpers for the various SEND message types:
1957 */
1958
/**
 * Extended math function, float[8] (pre-gen6 only).
 *
 * Before gen6 there is no MATH instruction; the operation is a SEND to the
 * shared math function unit, with the operand implicitly moved to MRF
 * msg_reg_nr.  'function' selects the math operation and 'precision' goes
 * into the message descriptor.  The data type (scalar vs. vector) is
 * derived from src's region.
 */
void gen4_math(struct brw_codegen *p,
	       struct brw_reg dest,
	       unsigned function,
	       unsigned msg_reg_nr,
	       struct brw_reg src,
	       unsigned precision )
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   unsigned data_type;
   if (has_scalar_region(src)) {
      data_type = BRW_MATH_DATA_SCALAR;
   } else {
      data_type = BRW_MATH_DATA_VECTOR;
   }

   assert(devinfo->gen < 6);

   /* Example code doesn't set predicate_control for send
    * instructions.
    */
   brw_inst_set_pred_control(devinfo, insn, 0);
   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
                        insn,
                        function,
                        src.type == BRW_REGISTER_TYPE_D,
                        precision,
                        data_type);
}
1994
/**
 * Extended math for gen6+, using the dedicated MATH instruction rather than
 * a SEND to the shared math unit (see gen4_math for pre-gen6).
 *
 * The asserts encode per-generation operand restrictions: integer-divide
 * functions take integer sources, POW may take a GRF (or gen8+ immediate)
 * src1, all other functions are float with a null src1, and gen6 requires
 * stride-1 sources without source modifiers.
 */
void gen6_math(struct brw_codegen *p,
	       struct brw_reg dest,
	       unsigned function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 8 && src0.file == BRW_IMMEDIATE_VALUE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
      if (function == BRW_MATH_FUNCTION_POW) {
         assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
                (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
      } else {
         assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
                src1.nr == BRW_ARF_NULL);
      }
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
2050
2051 /**
2052 * Return the right surface index to access the thread scratch space using
2053 * stateless dataport messages.
2054 */
2055 unsigned
2056 brw_scratch_surface_idx(const struct brw_codegen *p)
2057 {
2058 /* The scratch space is thread-local so IA coherency is unnecessary. */
2059 if (p->devinfo->gen >= 8)
2060 return GEN8_BTI_STATELESS_NON_COHERENT;
2061 else
2062 return BRW_BTI_STATELESS;
2063 }
2064
2065 /**
2066 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2067 * using a constant offset per channel.
2068 *
2069 * The offset must be aligned to oword size (16 bytes). Used for
2070 * register spilling.
2071 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
				   struct brw_reg mrf,
				   int num_regs,
				   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   uint32_t msg_type;

   /* On gen6+ the message takes the offset in owords, not bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One header register plus the data payload.  Only 1, 2 or 4 registers
    * (2/4/8 owords) can be written per message.
    */
   const unsigned mlen = 1 + num_regs;
   const unsigned msg_control =
      (num_regs == 1 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS :
       num_regs == 2 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS :
       num_regs == 4 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : 0);
   assert(msg_control);

   /* Set up the message header. This is g0, with g0.2 filled with
    * the offset. We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
	 src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
	 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection. Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       brw_scratch_surface_idx(p),
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
2174
2175
2176 /**
2177 * Read a block of owords (half a GRF each) from the scratch buffer
2178 * using a constant index per channel.
2179 *
2180 * Offset must be aligned to oword size (16 bytes). Used for register
2181 * spilling.
2182 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* On gen6+ the message takes the offset in owords, not bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want. By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything. This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* Only 1, 2 or 4 registers (2/4/8 owords) can be read per message. */
   const unsigned rlen = num_regs;
   const unsigned msg_control =
      (num_regs == 1 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS :
       num_regs == 2 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS :
       num_regs == 4 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : 0);
   assert(msg_control);
   const unsigned target_cache = devinfo->gen >= 7 ?
      BRW_DATAPORT_READ_TARGET_DATA_CACHE :
      BRW_DATAPORT_READ_TARGET_RENDER_CACHE;

   /* Build the message header from g0, with the scratch offset in element 2
    * (see brw_oword_block_write_scratch for why g0 itself isn't used).
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest);	/* UW? */
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
	 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_dp_read_message(p,
			      insn,
			      brw_scratch_surface_idx(p),
			      msg_control,
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      target_cache,
			      1, /* msg_length */
                              true, /* header_present */
			      rlen);
   }
}
2258
/**
 * Read a block of registers from the scratch buffer using the gen7+
 * dedicated scratch-block-read message (HWord granularity, offset taken
 * relative to the scratch base pointer in the g0 header).
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
2293
2294 /**
2295 * Read a float[4] vector from the data port Data Cache (const buffer).
2296 * Location (in buffer) should be a multiple of 16.
2297 * Used for fetching shader constants.
2298 */
void brw_oword_block_read(struct brw_codegen *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Header setup and the SEND itself share the same default state:
    * SIMD8, unpredicated, uncompressed, with the execution mask disabled.
    */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_dp_read_message(p,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
			   1, /* msg_length */
                           true, /* header_present */
			   1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
2353
2354
/**
 * Emit a render target write message.
 *
 * Gen6+ uses SENDC with a headerless payload sent straight from 'payload';
 * pre-gen6 uses SEND with the payload in MRFs starting at payload.nr and an
 * implied header in src0.
 */
void brw_fb_WRITE(struct brw_codegen *p,
                  struct brw_reg payload,
                  struct brw_reg implied_header,
                  unsigned msg_control,
                  unsigned binding_table_index,
                  unsigned msg_length,
                  unsigned response_length,
                  bool eot,
                  bool last_render_target,
                  bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   /* Size the (null) destination region to the current execution width. */
   if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
			    insn,
			    binding_table_index,
			    msg_control,
			    msg_type,
			    msg_length,
			    header_present,
			    last_render_target,
			    response_length,
			    eot,
			    0 /* send_commit_msg */);
}
2410
/**
 * Emit a gen9+ render target read message.
 *
 * The message subtype distinguishes SIMD16 (0) from SIMD8 (1) based on the
 * current execution size, and the RT slot group is derived from the current
 * quarter control.
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   const unsigned msg_subtype =
      brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_dp_read_message(p, insn, binding_table_index,
                           per_sample << 5 | msg_subtype,
                           GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                           BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
                           msg_length, true /* header_present */,
                           response_length);
   brw_inst_set_rt_slot_group(devinfo, insn,
                              brw_inst_qtr_control(devinfo, p->current) / 2);

   return insn;
}
2439
2440 /**
2441 * Texture sample instruction.
2442 * Note: the msg_type plus msg_length values determine exactly what kind
2443 * of sampling operation is performed. See volume 4, page 161 of docs.
2444 */
void brw_SAMPLE(struct brw_codegen *p,
		struct brw_reg dest,
		unsigned msg_reg_nr,
		struct brw_reg src0,
		unsigned binding_table_index,
		unsigned sampler,
		unsigned msg_type,
		unsigned response_length,
		unsigned msg_length,
		unsigned header_present,
		unsigned simd_mode,
		unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* A msg_reg_nr of -1 means the payload is already in place and no
    * implied move to an MRF is needed.
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
2496
2497 /* Adjust the message header's sampler state pointer to
2498 * select the correct group of 16 samplers.
2499 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         /* Advance the pointer in header DWord 3 by whole groups of 16
          * samplers: (sampler / 16) groups of 16 * 16 bytes each.
          */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* Extract bits 7:4 of the index (group * 16) and multiply by 16 to
       * get the byte offset of the group: group * 16 samplers * 16 bytes.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2542
2543 /* All these variables are pretty confusing - we might be better off
2544 * using bitmasks and macros for this, in the old style. Or perhaps
2545 * just having the caller instantiate the fields in dword3 itself.
2546 */
void brw_urb_WRITE(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      /* OR 0xff00 into header DWord 5 to enable all channel masks. */
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2593
/**
 * Emit a SEND whose message descriptor is either an immediate or loaded
 * indirectly from a register (via a0.0).
 *
 * Returns the "setup" instruction -- the SEND itself in the immediate case,
 * or the OR that loads the address register in the indirect case -- so the
 * caller can fill in additional descriptor bits with the usual
 * brw_set_*_message() helpers.
 */
struct brw_inst *
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;
   int setup;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   /* We hold on to the setup instruction (the SEND in the direct case, the OR
    * in the indirect case) by its index in the instruction store.  The
    * pointer returned by next_insn() may become invalid if emitting the SEND
    * in the indirect case reallocs the store.
    */

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      setup = p->nr_insn;
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, desc);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the usual
       * brw_set_*_message() helper functions.
       */
      setup = p->nr_insn;
      brw_OR(p, addr, desc, brw_imm_ud(0));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   if (dst.width < BRW_EXECUTE_8)
      brw_inst_set_exec_size(devinfo, send, dst.width);

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);

   return &p->store[setup];
}
2650
/**
 * Emit a surface-access SEND where the surface binding table index may be
 * an immediate or come from a register.
 *
 * Non-immediate surface indices are masked to 8 bits and loaded into a0.0
 * so they become part of the indirect message descriptor.  Returns the
 * setup instruction (see brw_send_indirect_message) for further descriptor
 * setup by the caller.
 */
static struct brw_inst *
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned message_len,
                                  unsigned response_len,
                                  bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      insn = brw_AND(p, addr,
                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                               BRW_GET_SWZ(surface.swizzle, 0)),
                     brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
   brw_inst_set_mlen(devinfo, insn, message_len);
   brw_inst_set_rlen(devinfo, insn, response_len);
   brw_inst_set_header_present(devinfo, insn, header_present);

   return insn;
}
2692
2693 static bool
2694 while_jumps_before_offset(const struct gen_device_info *devinfo,
2695 brw_inst *insn, int while_offset, int start_offset)
2696 {
2697 int scale = 16 / brw_jump_scale(devinfo);
2698 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2699 : brw_inst_jip(devinfo, insn);
2700 return while_offset + jip * scale <= start_offset;
2701 }
2702
2703
/* Scan forward from start_offset for the instruction ending the current
 * block: the matching ENDIF, an ELSE, a HALT, or the WHILE that closes an
 * enclosing loop.  Nested IF/ENDIF pairs are skipped by tracking depth.
 * Returns 0 if no block end is found before the end of the program.
 */
static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct gen_device_info *devinfo = p->devinfo;

   int depth = 0;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_IF:
         depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case BRW_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop.  Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         /* fallthrough */
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_HALT:
         if (depth == 0)
            return offset;
      }
   }

   return 0;
}
2743
2744 /* There is no DO instruction on gen6, so to find the end of the loop
2745 * we have to see if the loop is jumping back before our start
2746 * instruction.
2747 */
2748 static int
2749 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2750 {
2751 const struct gen_device_info *devinfo = p->devinfo;
2752 int offset;
2753 void *store = p->store;
2754
2755 assert(devinfo->gen >= 6);
2756
2757 /* Always start after the instruction (such as a WHILE) we're trying to fix
2758 * up.
2759 */
2760 for (offset = next_offset(devinfo, store, start_offset);
2761 offset < p->next_insn_offset;
2762 offset = next_offset(devinfo, store, offset)) {
2763 brw_inst *insn = store + offset;
2764
2765 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2766 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2767 return offset;
2768 }
2769 }
2770 assert(!"not reached");
2771 return start_offset;
2772 }
2773
2774 /* After program generation, go back and update the UIP and JIP of
2775 * BREAK, CONT, and HALT instructions to their correct locations.
2776 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* Pre-gen6 control flow is handled by brw_patch_break_cont() instead. */
   if (devinfo->gen < 6)
      return;

   /* Walk the program in 16-byte (one native instruction) steps; compacted
    * instructions would break this, hence the cmpt_control assert.
    */
   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
	    (brw_find_loop_end(p, offset) - offset +
             (devinfo->gen == 6 ? 16 : 0)) / scale);
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;

      case BRW_OPCODE_ENDIF: {
         /* No following block end means jump one instruction forward. */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
	 break;
      }

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
	 } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;
      }
   }
}
2846
2847 void brw_ff_sync(struct brw_codegen *p,
2848 struct brw_reg dest,
2849 unsigned msg_reg_nr,
2850 struct brw_reg src0,
2851 bool allocate,
2852 unsigned response_length,
2853 bool eot)
2854 {
2855 const struct gen_device_info *devinfo = p->devinfo;
2856 brw_inst *insn;
2857
2858 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2859
2860 insn = next_insn(p, BRW_OPCODE_SEND);
2861 brw_set_dest(p, insn, dest);
2862 brw_set_src0(p, insn, src0);
2863 brw_set_src1(p, insn, brw_imm_d(0));
2864
2865 if (devinfo->gen < 6)
2866 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2867
2868 brw_set_ff_sync_message(p,
2869 insn,
2870 allocate,
2871 response_length,
2872 eot);
2873 }
2874
2875 /**
2876 * Emit the SEND instruction necessary to generate stream output data on Gen6
2877 * (for transform feedback).
2878 *
2879 * If send_commit_msg is true, this is the last piece of stream output data
2880 * from this thread, so send the data as a committed write. According to the
2881 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2882 *
2883 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2884 * writes are complete by sending the final write as a committed write."
2885 */
2886 void
2887 brw_svb_write(struct brw_codegen *p,
2888 struct brw_reg dest,
2889 unsigned msg_reg_nr,
2890 struct brw_reg src0,
2891 unsigned binding_table_index,
2892 bool send_commit_msg)
2893 {
2894 brw_inst *insn;
2895
2896 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2897
2898 insn = next_insn(p, BRW_OPCODE_SEND);
2899 brw_set_dest(p, insn, dest);
2900 brw_set_src0(p, insn, src0);
2901 brw_set_src1(p, insn, brw_imm_d(0));
2902 brw_set_dp_write_message(p, insn,
2903 binding_table_index,
2904 0, /* msg_control: ignored */
2905 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2906 1, /* msg_length */
2907 true, /* header_present */
2908 0, /* last_render_target: ignored */
2909 send_commit_msg, /* response_length */
2910 0, /* end_of_thread */
2911 send_commit_msg); /* send_commit_msg */
2912 }
2913
2914 static unsigned
2915 brw_surface_payload_size(struct brw_codegen *p,
2916 unsigned num_channels,
2917 bool has_simd4x2,
2918 bool has_simd16)
2919 {
2920 if (has_simd4x2 &&
2921 brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2922 return 1;
2923 else if (has_simd16 &&
2924 brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16)
2925 return 2 * num_channels;
2926 else
2927 return num_channels;
2928 }
2929
2930 static void
2931 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2932 brw_inst *insn,
2933 unsigned atomic_op,
2934 bool response_expected)
2935 {
2936 const struct gen_device_info *devinfo = p->devinfo;
2937 unsigned msg_control =
2938 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2939 (response_expected ? 1 << 5 : 0); /* Return data expected */
2940
2941 if (devinfo->gen >= 8 || devinfo->is_haswell) {
2942 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2943 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2944 msg_control |= 1 << 4; /* SIMD8 mode */
2945
2946 brw_inst_set_dp_msg_type(devinfo, insn,
2947 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2948 } else {
2949 brw_inst_set_dp_msg_type(devinfo, insn,
2950 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2951 }
2952 } else {
2953 brw_inst_set_dp_msg_type(devinfo, insn,
2954 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2955
2956 if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2957 msg_control |= 1 << 4; /* SIMD8 mode */
2958 }
2959
2960 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2961 }
2962
/**
 * Emit an untyped atomic operation SEND through the data cache.
 *
 * \p atomic_op is a BRW_AOP_* opcode, \p surface the binding table index (or
 * indirect descriptor), and \p response_expected selects whether the old
 * value is returned in \p dst.
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Haswell+ moved untyped messages to data cache port 1. */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   /* NOTE(review): response_expected (0 or 1) is deliberately passed as the
    * num_channels argument of brw_surface_payload_size() -- an atomic
    * returns at most one channel of data.
    */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, true),
      align1);

   brw_set_dp_untyped_atomic_message(
      p, insn, atomic_op, response_expected);
}
2993
2994 static void
2995 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2996 struct brw_inst *insn,
2997 unsigned num_channels)
2998 {
2999 const struct gen_device_info *devinfo = p->devinfo;
3000 /* Set mask of 32-bit channels to drop. */
3001 unsigned msg_control = 0xf & (0xf << num_channels);
3002
3003 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3004 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3005 msg_control |= 1 << 4; /* SIMD16 mode */
3006 else
3007 msg_control |= 2 << 4; /* SIMD8 mode */
3008 }
3009
3010 brw_inst_set_dp_msg_type(devinfo, insn,
3011 (devinfo->gen >= 8 || devinfo->is_haswell ?
3012 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
3013 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
3014 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3015 }
3016
3017 void
3018 brw_untyped_surface_read(struct brw_codegen *p,
3019 struct brw_reg dst,
3020 struct brw_reg payload,
3021 struct brw_reg surface,
3022 unsigned msg_length,
3023 unsigned num_channels)
3024 {
3025 const struct gen_device_info *devinfo = p->devinfo;
3026 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3027 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3028 GEN7_SFID_DATAPORT_DATA_CACHE);
3029 struct brw_inst *insn = brw_send_indirect_surface_message(
3030 p, sfid, dst, payload, surface, msg_length,
3031 brw_surface_payload_size(p, num_channels, true, true),
3032 false);
3033
3034 brw_set_dp_untyped_surface_read_message(
3035 p, insn, num_channels);
3036 }
3037
3038 static void
3039 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
3040 struct brw_inst *insn,
3041 unsigned num_channels)
3042 {
3043 const struct gen_device_info *devinfo = p->devinfo;
3044 /* Set mask of 32-bit channels to drop. */
3045 unsigned msg_control = 0xf & (0xf << num_channels);
3046
3047 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3048 if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3049 msg_control |= 1 << 4; /* SIMD16 mode */
3050 else
3051 msg_control |= 2 << 4; /* SIMD8 mode */
3052 } else {
3053 if (devinfo->gen >= 8 || devinfo->is_haswell)
3054 msg_control |= 0 << 4; /* SIMD4x2 mode */
3055 else
3056 msg_control |= 2 << 4; /* SIMD8 mode */
3057 }
3058
3059 brw_inst_set_dp_msg_type(devinfo, insn,
3060 devinfo->gen >= 8 || devinfo->is_haswell ?
3061 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
3062 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
3063 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3064 }
3065
3066 void
3067 brw_untyped_surface_write(struct brw_codegen *p,
3068 struct brw_reg payload,
3069 struct brw_reg surface,
3070 unsigned msg_length,
3071 unsigned num_channels)
3072 {
3073 const struct gen_device_info *devinfo = p->devinfo;
3074 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3075 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3076 GEN7_SFID_DATAPORT_DATA_CACHE);
3077 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3078 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3079 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3080 WRITEMASK_X : WRITEMASK_XYZW;
3081 struct brw_inst *insn = brw_send_indirect_surface_message(
3082 p, sfid, brw_writemask(brw_null_reg(), mask),
3083 payload, surface, msg_length, 0, align1);
3084
3085 brw_set_dp_untyped_surface_write_message(
3086 p, insn, num_channels);
3087 }
3088
3089 static void
3090 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3091 struct brw_inst *insn,
3092 unsigned atomic_op,
3093 bool response_expected)
3094 {
3095 const struct gen_device_info *devinfo = p->devinfo;
3096 unsigned msg_control =
3097 atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3098 (response_expected ? 1 << 5 : 0); /* Return data expected */
3099
3100 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3101 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3102 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3103 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3104
3105 brw_inst_set_dp_msg_type(devinfo, insn,
3106 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3107 } else {
3108 brw_inst_set_dp_msg_type(devinfo, insn,
3109 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3110 }
3111
3112 } else {
3113 brw_inst_set_dp_msg_type(devinfo, insn,
3114 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3115
3116 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3117 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3118 }
3119
3120 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3121 }
3122
3123 void
3124 brw_typed_atomic(struct brw_codegen *p,
3125 struct brw_reg dst,
3126 struct brw_reg payload,
3127 struct brw_reg surface,
3128 unsigned atomic_op,
3129 unsigned msg_length,
3130 bool response_expected) {
3131 const struct gen_device_info *devinfo = p->devinfo;
3132 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3133 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3134 GEN6_SFID_DATAPORT_RENDER_CACHE);
3135 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3136 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3137 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3138 struct brw_inst *insn = brw_send_indirect_surface_message(
3139 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3140 brw_surface_payload_size(p, response_expected,
3141 devinfo->gen >= 8 || devinfo->is_haswell, false),
3142 true);
3143
3144 brw_set_dp_typed_atomic_message(
3145 p, insn, atomic_op, response_expected);
3146 }
3147
3148 static void
3149 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3150 struct brw_inst *insn,
3151 unsigned num_channels)
3152 {
3153 const struct gen_device_info *devinfo = p->devinfo;
3154 /* Set mask of unused channels. */
3155 unsigned msg_control = 0xf & (0xf << num_channels);
3156
3157 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3158 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3159 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3160 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3161 else
3162 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3163 }
3164
3165 brw_inst_set_dp_msg_type(devinfo, insn,
3166 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3167 } else {
3168 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3169 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3170 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3171 }
3172
3173 brw_inst_set_dp_msg_type(devinfo, insn,
3174 GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3175 }
3176
3177 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3178 }
3179
3180 void
3181 brw_typed_surface_read(struct brw_codegen *p,
3182 struct brw_reg dst,
3183 struct brw_reg payload,
3184 struct brw_reg surface,
3185 unsigned msg_length,
3186 unsigned num_channels)
3187 {
3188 const struct gen_device_info *devinfo = p->devinfo;
3189 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3190 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3191 GEN6_SFID_DATAPORT_RENDER_CACHE);
3192 struct brw_inst *insn = brw_send_indirect_surface_message(
3193 p, sfid, dst, payload, surface, msg_length,
3194 brw_surface_payload_size(p, num_channels,
3195 devinfo->gen >= 8 || devinfo->is_haswell, false),
3196 true);
3197
3198 brw_set_dp_typed_surface_read_message(
3199 p, insn, num_channels);
3200 }
3201
3202 static void
3203 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3204 struct brw_inst *insn,
3205 unsigned num_channels)
3206 {
3207 const struct gen_device_info *devinfo = p->devinfo;
3208 /* Set mask of unused channels. */
3209 unsigned msg_control = 0xf & (0xf << num_channels);
3210
3211 if (devinfo->gen >= 8 || devinfo->is_haswell) {
3212 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3213 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3214 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3215 else
3216 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3217 }
3218
3219 brw_inst_set_dp_msg_type(devinfo, insn,
3220 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3221
3222 } else {
3223 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3224 if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3225 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3226 }
3227
3228 brw_inst_set_dp_msg_type(devinfo, insn,
3229 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3230 }
3231
3232 brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3233 }
3234
3235 void
3236 brw_typed_surface_write(struct brw_codegen *p,
3237 struct brw_reg payload,
3238 struct brw_reg surface,
3239 unsigned msg_length,
3240 unsigned num_channels)
3241 {
3242 const struct gen_device_info *devinfo = p->devinfo;
3243 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3244 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3245 GEN6_SFID_DATAPORT_RENDER_CACHE);
3246 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3247 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3248 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3249 WRITEMASK_X : WRITEMASK_XYZW);
3250 struct brw_inst *insn = brw_send_indirect_surface_message(
3251 p, sfid, brw_writemask(brw_null_reg(), mask),
3252 payload, surface, msg_length, 0, true);
3253
3254 brw_set_dp_typed_surface_write_message(
3255 p, insn, num_channels);
3256 }
3257
3258 static void
3259 brw_set_memory_fence_message(struct brw_codegen *p,
3260 struct brw_inst *insn,
3261 enum brw_message_target sfid,
3262 bool commit_enable)
3263 {
3264 const struct gen_device_info *devinfo = p->devinfo;
3265
3266 brw_set_message_descriptor(p, insn, sfid,
3267 1 /* message length */,
3268 (commit_enable ? 1 : 0) /* response length */,
3269 true /* header present */,
3270 false);
3271
3272 switch (sfid) {
3273 case GEN6_SFID_DATAPORT_RENDER_CACHE:
3274 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3275 break;
3276 case GEN7_SFID_DATAPORT_DATA_CACHE:
3277 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3278 break;
3279 default:
3280 unreachable("Not reached");
3281 }
3282
3283 if (commit_enable)
3284 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3285 }
3286
/**
 * Emit a memory fence.
 *
 * \p dst is only used for dependency tracking; the fence messages emitted
 * here write nothing meaningful back (except on IVB, where the commit
 * write-back is used to stall, see below).
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Commit write-back is only requested on IVB (gen7, non-Haswell), where
    * it is needed for the stall sequence below.
    */
   const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
   struct brw_inst *insn;

   /* The fences are emitted as scalar, unmasked SENDs. */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = vec1(dst);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   dst = retype(dst, BRW_REGISTER_TYPE_UW);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too. Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   brw_pop_insn_state(p);
}
3331
3332 void
3333 brw_pixel_interpolator_query(struct brw_codegen *p,
3334 struct brw_reg dest,
3335 struct brw_reg mrf,
3336 bool noperspective,
3337 unsigned mode,
3338 struct brw_reg data,
3339 unsigned msg_length,
3340 unsigned response_length)
3341 {
3342 const struct gen_device_info *devinfo = p->devinfo;
3343 struct brw_inst *insn;
3344 const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);
3345
3346 /* brw_send_indirect_message will automatically use a direct send message
3347 * if data is actually immediate.
3348 */
3349 insn = brw_send_indirect_message(p,
3350 GEN7_SFID_PIXEL_INTERPOLATOR,
3351 dest,
3352 mrf,
3353 vec1(data));
3354 brw_inst_set_mlen(devinfo, insn, msg_length);
3355 brw_inst_set_rlen(devinfo, insn, response_length);
3356
3357 brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
3358 brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
3359 brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
3360 brw_inst_set_pi_message_type(devinfo, insn, mode);
3361 }
3362
/**
 * Emit code that writes the index of the first enabled execution channel
 * into \p dst (the X component in align16 mode).
 *
 * The strategy differs by generation and access mode: Gen8+ can read the
 * execution mask register directly; Gen7 has to reconstruct it in a flag
 * register first (align1) or probe with masked writes (align16).
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
   const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
   brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the mask register.  The same register exists
          * on HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         inst = brw_FBL(p, vec1(dst),
                        retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value of
          * this register so you'll get the first active channel relative to
          * the specified quarter control as result.
          */
      } else {
         const struct brw_reg flag = brw_flag_reg(1, 0);

         /* Clear f1.0 before accumulating the execution mask into it. */
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0.  We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_flag_reg_nr(devinfo, inst, 1);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3451
/**
 * Emit code that copies the value of channel \p idx of \p src into every
 * channel of \p dst (a "broadcast" of one scalar component).
 *
 * \p idx may be an immediate or a register; \p src must be a directly
 * addressed GRF.
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         /* Byte offset of the source register within the GRF file. */
         const unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit)
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         brw_MOV(p, dst,
                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
                        src.type));
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3532
3533 /**
3534 * This instruction is generated as a single-channel align1 instruction by
3535 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3536 *
3537 * We can't use the typed atomic op in the FS because that has the execution
3538 * mask ANDed with the pixel mask, but we just want to write the one dword for
3539 * all the pixels.
3540 *
3541 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3542 * one u32. So we use the same untyped atomic write message as the pixel
3543 * shader.
3544 *
3545 * The untyped atomic operation requires a BUFFER surface type with RAW
3546 * format, and is only accessible through the legacy DATA_CACHE dataport
3547 * messages.
3548 */
3549 void brw_shader_time_add(struct brw_codegen *p,
3550 struct brw_reg payload,
3551 uint32_t surf_index)
3552 {
3553 const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
3554 HSW_SFID_DATAPORT_DATA_CACHE_1 :
3555 GEN7_SFID_DATAPORT_DATA_CACHE);
3556 assert(p->devinfo->gen >= 7);
3557
3558 brw_push_insn_state(p);
3559 brw_set_default_access_mode(p, BRW_ALIGN_1);
3560 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3561 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3562 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3563
3564 /* We use brw_vec1_reg and unmasked because we want to increment the given
3565 * offset only once.
3566 */
3567 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3568 BRW_ARF_NULL, 0));
3569 brw_set_src0(p, send, brw_vec1_reg(payload.file,
3570 payload.nr, 0));
3571 brw_set_src1(p, send, brw_imm_ud(0));
3572 brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
3573 brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
3574 brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
3575
3576 brw_pop_insn_state(p);
3577 }
3578
3579
3580 /**
3581 * Emit the SEND message for a barrier
3582 */
3583 void
3584 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3585 {
3586 const struct gen_device_info *devinfo = p->devinfo;
3587 struct brw_inst *inst;
3588
3589 assert(devinfo->gen >= 7);
3590
3591 inst = next_insn(p, BRW_OPCODE_SEND);
3592 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3593 brw_set_src0(p, inst, src);
3594 brw_set_src1(p, inst, brw_null_reg());
3595
3596 brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
3597 1 /* msg_length */,
3598 0 /* response_length */,
3599 false /* header_present */,
3600 false /* end_of_thread */);
3601
3602 brw_inst_set_gateway_notify(devinfo, inst, 1);
3603 brw_inst_set_gateway_subfuncid(devinfo, inst,
3604 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3605
3606 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3607 }
3608
3609
3610 /**
3611 * Emit the wait instruction for a barrier
3612 */
3613 void
3614 brw_WAIT(struct brw_codegen *p)
3615 {
3616 const struct gen_device_info *devinfo = p->devinfo;
3617 struct brw_inst *insn;
3618
3619 struct brw_reg src = brw_notification_reg();
3620
3621 insn = next_insn(p, BRW_OPCODE_WAIT);
3622 brw_set_dest(p, insn, src);
3623 brw_set_src0(p, insn, src);
3624 brw_set_src1(p, insn, brw_null_reg());
3625
3626 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3627 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3628 }