/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with a channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }

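      /* Illustrative usage sketch (not part of the interface): a SIMD16
       * builder can be narrowed to either half of its channels, e.g. to emit
       * an operation the hardware only supports 8-wide.  `dst' and `src' are
       * placeholder registers:
       *
       *    bld.group(8, 0).MOV(dst, src);                  // channels 0-7
       *    bld.group(8, 1).MOV(horiz_offset(dst, 8),
       *                        horiz_offset(src, 8));      // channels 8-15
       */
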
      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

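      /* Illustrative usage sketch: exec_all() is commonly combined with
       * group() for helper computations that must execute regardless of
       * control flow, as in the pattern used throughout this file.  `tmp' is
       * a placeholder register:
       *
       *    const fs_builder ubld = bld.exec_all().group(1, 0);
       *    ubld.MOV(tmp, brw_imm_ud(0));   // unconditional scalar write
       */
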
      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }

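      /* Illustrative usage sketch: the allocation is sized from the type and
       * the SIMD width, so in a SIMD16 program for example:
       *
       *    const dst_reg tmp  = bld.vgrf(BRW_REGISTER_TYPE_F);     // 2 GRFs
       *    const dst_reg pair = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); // 4 GRFs
       */
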
      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(fix_byte_src(src1))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, fix_byte_src(src1)));

         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(fix_byte_src(src1)),
                                    fix_3src_operand(fix_byte_src(src2))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, fix_byte_src(src1), fix_byte_src(src2)));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as an operand for src1, so use
          * the same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(fix_byte_src(src0)),
                                     fix_unsigned_negate(fix_byte_src(src1))));
      }

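      /* Illustrative usage sketch: the conditional mod selects which operand
       * wins, which gives a branch-free min/max.  `a' and `b' are placeholder
       * registers:
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);  // dst = max(a, b)
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_L);   // dst = min(a, b)
       */
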
      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }

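      /* Illustrative usage sketch: this is typically used to turn a possibly
       * divergent value into one that is uniform across the SIMD thread,
       * e.g. before using it as a descriptor in a send.  `surface_index' is
       * a placeholder register:
       *
       *    const src_reg surface = bld.emit_uniformize(surface_index);
       */
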
      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               src_reg left_comp = component(left, half_width - 1);
               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            const dst_reg left = horiz_stride(tmp, 2);
            const dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);

               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               right = horiz_stride(horiz_offset(tmp, 3), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);

               for (unsigned i = 0; i < dispatch_width(); i += 4) {
                  src_reg left = component(tmp, i + 1);
                  dst_reg right = horiz_offset(tmp, i + 2);
                  set_condmod(mod, ubld.emit(opcode, right, left, right));
               }
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            src_reg left = component(tmp, i - 1);
            dst_reg right = horiz_offset(tmp, i);
            set_condmod(mod, ubld.emit(opcode, right, left, right));

            if (dispatch_width() > i * 2) {
               left = component(tmp, i * 3 - 1);
               right = horiz_offset(tmp, i * 3);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }

            if (dispatch_width() > i * 4) {
               left = component(tmp, i * 5 - 1);
               right = horiz_offset(tmp, i * 5);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               left = component(tmp, i * 7 - 1);
               right = horiz_offset(tmp, i * 7);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }
         }
      }

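      /* Illustrative usage sketch: emit_scan() turns \p tmp into an
       * inclusive scan, i.e. after it runs each channel holds the
       * combination (under \p opcode) of its own value and all lower
       * channels within its cluster.  `value' is a placeholder register:
       *
       *    bld.MOV(tmp, value);
       *    bld.emit_scan(BRW_OPCODE_ADD, tmp, bld.dispatch_width(),
       *                  BRW_CONDITIONAL_NONE);
       *    // channel i of tmp now holds value[0] + ... + value[i]
       */
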
      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

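      /* Illustrative usage sketch: a common pattern is comparing into the
       * null register purely for the side effect on the flag register,
       * which a later predicated instruction then consumes.  `a' and `b'
       * are placeholder registers:
       *
       *    bld.CMP(bld.null_reg_d(), a, b, BRW_CONDITIONAL_GE);
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, a, b));
       */
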
      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(fix_byte_src(src1), BRW_REGISTER_TYPE_F),
                                 fix_byte_src(src2)));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }

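      /* Illustrative usage sketch: LOAD_PAYLOAD() is how message payloads
       * are assembled from individual registers before a send, e.g. packing
       * two components into one contiguous block.  `comp0' and `comp1' are
       * placeholder registers:
       *
       *    const src_reg srcs[] = { comp0, comp1 };
       *    const dst_reg payload = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
       *    bld.LOAD_PAYLOAD(payload, srcs, 2, 0);
       */
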
      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         instruction *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(dst, BRW_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;

         return inst;
      }

      backend_shader *shader;

      /**
       * Byte-sized operands are not supported for src1 on Gen11+.
       */
      src_reg
      fix_byte_src(const src_reg &src) const
      {
         if (shader->devinfo->gen < 11 || type_sz(src.type) != 1)
            return src;

         dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ?
                             BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D);
         MOV(temp, src);
         return src_reg(temp);
      }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            /* fallthrough */
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif