/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
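
      /*
       * Illustrative usage sketch (the register names below are
       * hypothetical, not part of this interface): a pass normally builds
       * at the shader's native dispatch width and appends at the end of the
       * instruction list, e.g.
       *
       *    const fs_builder bld = fs_builder(shader, 16).at_end();
       *    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.MOV(tmp, src);
       *    bld.ADD(dst, tmp, brw_imm_f(1.0f));
       *
       * where shader is the backend_shader being compiled.
       */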

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
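
      /*
       * For example (illustrative), in a SIMD16 program
       *
       *    bld.group(8, 1)
       *
       * yields a SIMD8 builder whose instructions act on channels 8..15 and
       * use the corresponding execution mask bits, while
       *
       *    bld.exec_all().group(1, 0)
       *
       * is a common way to build a single-channel scalar computation that
       * ignores the execution mask.
       */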

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
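
      /*
       * For example (assuming the usual 32-byte GRF, i.e. REG_SIZE == 32):
       * in a SIMD16 builder, vgrf(BRW_REGISTER_TYPE_F, 2) allocates
       * DIV_ROUND_UP(2 * 4 * 16, 32) = 4 registers, enough for two logical
       * 32-bit components at the current dispatch width.
       */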

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(fix_byte_src(src1))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, fix_byte_src(src1)));

         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(fix_byte_src(src1)),
                                    fix_3src_operand(fix_byte_src(src2))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, fix_byte_src(src1), fix_byte_src(src2)));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't use byte operands for src1, so use the
          * same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(fix_byte_src(src0)),
                                     fix_unsigned_negate(fix_byte_src(src1))));
      }
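
      /*
       * Example (illustrative sketch, hypothetical register names): clamping
       * a value to an upper bound with the SEL-based min/max helper,
       *
       *    bld.emit_minmax(dst, value, brw_imm_f(1.0f), BRW_CONDITIONAL_L);
       *
       * which selects the smaller of the two operands into dst.
       */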

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in SIMD16 or SIMD32
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
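
      /*
       * Example (illustrative, hypothetical names): a surface or sampler
       * index used in a message descriptor has to be dynamically uniform, so
       * a possibly divergent index is typically uniformized first:
       *
       *    const src_reg surface = bld.emit_uniformize(nonuniform_surface);
       *
       * The result reads the value of an arbitrary live channel from its
       * first component.
       */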

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these, so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               src_reg left_comp = component(left, half_width - 1);
               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            const dst_reg left = horiz_stride(tmp, 2);
            const dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);

               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               right = horiz_stride(horiz_offset(tmp, 3), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);

               for (unsigned i = 0; i < dispatch_width(); i += 4) {
                  src_reg left = component(tmp, i + 1);
                  dst_reg right = horiz_offset(tmp, i + 2);
                  set_condmod(mod, ubld.emit(opcode, right, left, right));
               }
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            src_reg left = component(tmp, i - 1);
            dst_reg right = horiz_offset(tmp, i);
            set_condmod(mod, ubld.emit(opcode, right, left, right));

            if (dispatch_width() > i * 2) {
               left = component(tmp, i * 3 - 1);
               right = horiz_offset(tmp, i * 3);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }

            if (dispatch_width() > i * 4) {
               left = component(tmp, i * 5 - 1);
               right = horiz_offset(tmp, i * 5);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               left = component(tmp, i * 7 - 1);
               right = horiz_offset(tmp, i * 7);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }
         }
      }
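
      /*
       * Worked example (illustrative): with opcode == BRW_OPCODE_ADD,
       * cluster_size == 4 and a SIMD8 builder, if tmp initially holds the
       * per-channel values c0..c7, emit_scan() leaves the inclusive prefix
       * sums of each cluster of four in place:
       *
       *    { c0, c0+c1, c0+c1+c2, c0+c1+c2+c3,
       *      c4, c4+c5, c4+c5+c6, c4+c5+c6+c7 }
       *
       * The pairwise step above handles strides of 2, the cluster_size > 2
       * step handles groups of 4, and the trailing loop propagates partial
       * results across larger power-of-two groups.
       */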

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
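
      /*
       * Typical use (illustrative sketch, hypothetical registers): comparing
       * into the null register just to update the flag, which a following
       * instruction is then predicated on:
       *
       *    bld.CMP(bld.null_reg_d(), a, b, BRW_CONDITIONAL_GE);
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, a, b));
       */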

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(fix_byte_src(src1), BRW_REGISTER_TYPE_F),
                                 fix_byte_src(src2)));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
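
      /*
       * In either path, LRP(dst, x, y, a) computes x * (1 - a) + y * a,
       * i.e. a blend from x (a == 0) to y (a == 1).  For example
       * (illustrative), bld.LRP(dst, x, y, brw_imm_f(0.5f)) averages the
       * two inputs.
       */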

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }
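
      /*
       * Worked example (illustrative, assuming the usual 32-byte GRF): a
       * LOAD_PAYLOAD built at SIMD16 from one header register plus two
       * 32-bit sources has size_written = 1 * REG_SIZE +
       * 2 * ALIGN(16 * 4 * dst.stride, REG_SIZE), i.e. 5 registers for a
       * packed destination with stride 1.
       */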

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         instruction *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(dst, BRW_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;

         return inst;
      }

      backend_shader *shader;

      /**
       * Byte-sized operands are not supported for src1 on Gen11+.
       */
      src_reg
      fix_byte_src(const src_reg &src) const
      {
         if (shader->devinfo->gen < 11 || type_sz(src.type) != 1)
            return src;

         dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ?
                             BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D);
         MOV(temp, src);
         return src_reg(temp);
      }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            /* fallthrough */
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math instructions.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif