intel/fs: Fix a typo in need_matching_subreg_offset
[mesa.git] / src / intel / compiler / brw_fs_builder.h
1 /* -*- c++ -*- */
2 /*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #ifndef BRW_FS_BUILDER_H
26 #define BRW_FS_BUILDER_H
27
28 #include "brw_ir_fs.h"
29 #include "brw_shader.h"
30
31 namespace brw {
32 /**
33 * Toolbox to assemble an FS IR program out of individual instructions.
34 *
35 * This object is meant to have an interface consistent with
36 * brw::vec4_builder. They cannot be fully interchangeable because
37 * brw::fs_builder generates scalar code while brw::vec4_builder generates
38 * vector code.
39 */
40 class fs_builder {
41 public:
42 /** Type used in this IR to represent a source of an instruction. */
43 typedef fs_reg src_reg;
44
45 /** Type used in this IR to represent the destination of an instruction. */
46 typedef fs_reg dst_reg;
47
48 /** Type used in this IR to represent an instruction. */
49 typedef fs_inst instruction;
50
51 /**
52 * Construct an fs_builder that inserts instructions into \p shader.
53 * \p dispatch_width gives the native execution width of the program.
54 */
55 fs_builder(backend_shader *shader,
56 unsigned dispatch_width) :
57 shader(shader), block(NULL), cursor(NULL),
58 _dispatch_width(dispatch_width),
59 _group(0),
60 force_writemask_all(false),
61 annotation()
62 {
63 }
64
65 /**
66 * Construct an fs_builder that inserts instructions into \p shader
67 * before instruction \p inst in basic block \p block. The default
68 * execution controls and debug annotation are initialized from the
69 * instruction passed as argument.
70 */
71 fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
72 shader(shader), block(block), cursor(inst),
73 _dispatch_width(inst->exec_size),
74 _group(inst->group),
75 force_writemask_all(inst->force_writemask_all)
76 {
77 annotation.str = inst->annotation;
78 annotation.ir = inst->ir;
79 }
80
81 /**
82 * Construct an fs_builder that inserts instructions before \p cursor in
83 * basic block \p block, inheriting other code generation parameters
84 * from this.
85 */
86 fs_builder
87 at(bblock_t *block, exec_node *cursor) const
88 {
89 fs_builder bld = *this;
90 bld.block = block;
91 bld.cursor = cursor;
92 return bld;
93 }
94
95 /**
96 * Construct an fs_builder appending instructions at the end of the
97 * instruction list of the shader, inheriting other code generation
98 * parameters from this.
99 */
100 fs_builder
101 at_end() const
102 {
103 return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
104 }
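/* Illustrative usage sketch (not part of the original header), assuming a
 * backend_shader compiled at SIMD16. Only constructors and methods defined
 * in this file are used.
 *
 *    const fs_builder bld = fs_builder(shader, 16).at_end();
 *    const fs_builder ubld = bld.exec_all().group(1, 0);
 *    // bld appends SIMD16 instructions at the end of the program,
 *    // ubld appends scalar instructions with channel masking disabled.
 */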
105
106 /**
107 * Construct a builder specifying the default SIMD width and group of
108 * channel enable signals, inheriting other code generation parameters
109 * from this.
110 *
111 * \p n gives the default SIMD width, \p i gives the slot group used for
112 * predication and control flow masking in multiples of \p n channels.
113 */
114 fs_builder
115 group(unsigned n, unsigned i) const
116 {
117 assert(force_writemask_all ||
118 (n <= dispatch_width() && i < dispatch_width() / n));
119 fs_builder bld = *this;
120 bld._dispatch_width = n;
121 bld._group += i * n;
122 return bld;
123 }
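/* Hedged example of the width/group semantics described above: starting
 * from a SIMD16 builder, group(8, 1) yields a SIMD8 builder whose
 * instructions operate on channels 8..15 (slot group 1 in units of 8
 * channels). dst and src below are placeholder registers.
 *
 *    const fs_builder hbld = bld.group(8, 1);
 *    hbld.MOV(dst, src);   // writes the second half of dst only
 */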
124
125 /**
126 * Alias for group() with width equal to eight.
127 */
128 fs_builder
129 half(unsigned i) const
130 {
131 return group(8, i);
132 }
133
134 /**
135 * Construct a builder with per-channel control flow execution masking
136 * disabled if \p b is true. If control flow execution masking is
137 * already disabled this has no effect.
138 */
139 fs_builder
140 exec_all(bool b = true) const
141 {
142 fs_builder bld = *this;
143 if (b)
144 bld.force_writemask_all = true;
145 return bld;
146 }
147
148 /**
149 * Construct a builder with the given debug annotation info.
150 */
151 fs_builder
152 annotate(const char *str, const void *ir = NULL) const
153 {
154 fs_builder bld = *this;
155 bld.annotation.str = str;
156 bld.annotation.ir = ir;
157 return bld;
158 }
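/* Hedged usage example: instructions subsequently emitted through the
 * returned builder carry the given annotation in debug dumps.
 *
 *    const fs_builder abld = bld.annotate("interpolation setup");
 */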
159
160 /**
161 * Get the SIMD width in use.
162 */
163 unsigned
164 dispatch_width() const
165 {
166 return _dispatch_width;
167 }
168
169 /**
170 * Get the channel group in use.
171 */
172 unsigned
173 group() const
174 {
175 return _group;
176 }
177
178 /**
179 * Allocate a virtual register of natural vector size (one for this IR)
180 * and SIMD width. \p n gives the amount of space to allocate in
181 * dispatch_width units (which is just enough space for one logical
182 * component in this IR).
183 */
184 dst_reg
185 vgrf(enum brw_reg_type type, unsigned n = 1) const
186 {
187 assert(dispatch_width() <= 32);
188
189 if (n > 0)
190 return dst_reg(VGRF, shader->alloc.allocate(
191 DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
192 REG_SIZE)),
193 type);
194 else
195 return retype(null_reg_ud(), type);
196 }
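/* Worked example of the size computation above (illustrative only): on a
 * SIMD16 builder, vgrf(BRW_REGISTER_TYPE_F, 2) allocates
 * DIV_ROUND_UP(2 * 4 bytes * 16 channels, REG_SIZE = 32 bytes) = 4 GRFs,
 * i.e. two SIMD16 float components laid out back to back.
 */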
197
198 /**
199 * Create a null register of floating type.
200 */
201 dst_reg
202 null_reg_f() const
203 {
204 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
205 }
206
207 dst_reg
208 null_reg_df() const
209 {
210 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
211 }
212
213 /**
214 * Create a null register of signed integer type.
215 */
216 dst_reg
217 null_reg_d() const
218 {
219 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
220 }
221
222 /**
223 * Create a null register of unsigned integer type.
224 */
225 dst_reg
226 null_reg_ud() const
227 {
228 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
229 }
230
231 /**
232 * Get the mask of SIMD channels enabled by dispatch and not yet
233 * disabled by discard.
234 */
235 src_reg
236 sample_mask_reg() const
237 {
238 if (shader->stage != MESA_SHADER_FRAGMENT) {
239 return brw_imm_d(0xffffffff);
240 } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
241 return brw_flag_reg(0, 1);
242 } else {
243 assert(shader->devinfo->gen >= 6 && dispatch_width() <= 16);
244 return retype(brw_vec1_grf((_group >= 16 ? 2 : 1), 7),
245 BRW_REGISTER_TYPE_UD);
246 }
247 }
248
249 /**
250 * Insert an instruction into the program.
251 */
252 instruction *
253 emit(const instruction &inst) const
254 {
255 return emit(new(shader->mem_ctx) instruction(inst));
256 }
257
258 /**
259 * Create and insert a nullary control instruction into the program.
260 */
261 instruction *
262 emit(enum opcode opcode) const
263 {
264 return emit(instruction(opcode, dispatch_width()));
265 }
266
267 /**
268 * Create and insert a nullary instruction into the program.
269 */
270 instruction *
271 emit(enum opcode opcode, const dst_reg &dst) const
272 {
273 return emit(instruction(opcode, dispatch_width(), dst));
274 }
275
276 /**
277 * Create and insert a unary instruction into the program.
278 */
279 instruction *
280 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
281 {
282 switch (opcode) {
283 case SHADER_OPCODE_RCP:
284 case SHADER_OPCODE_RSQ:
285 case SHADER_OPCODE_SQRT:
286 case SHADER_OPCODE_EXP2:
287 case SHADER_OPCODE_LOG2:
288 case SHADER_OPCODE_SIN:
289 case SHADER_OPCODE_COS:
290 return emit(instruction(opcode, dispatch_width(), dst,
291 fix_math_operand(src0)));
292
293 default:
294 return emit(instruction(opcode, dispatch_width(), dst, src0));
295 }
296 }
297
298 /**
299 * Create and insert a binary instruction into the program.
300 */
301 instruction *
302 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
303 const src_reg &src1) const
304 {
305 switch (opcode) {
306 case SHADER_OPCODE_POW:
307 case SHADER_OPCODE_INT_QUOTIENT:
308 case SHADER_OPCODE_INT_REMAINDER:
309 return emit(instruction(opcode, dispatch_width(), dst,
310 fix_math_operand(src0),
311 fix_math_operand(src1)));
312
313 default:
314 return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
315
316 }
317 }
318
319 /**
320 * Create and insert a ternary instruction into the program.
321 */
322 instruction *
323 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
324 const src_reg &src1, const src_reg &src2) const
325 {
326 switch (opcode) {
327 case BRW_OPCODE_BFE:
328 case BRW_OPCODE_BFI2:
329 case BRW_OPCODE_MAD:
330 case BRW_OPCODE_LRP:
331 return emit(instruction(opcode, dispatch_width(), dst,
332 fix_3src_operand(src0),
333 fix_3src_operand(src1),
334 fix_3src_operand(src2)));
335
336 default:
337 return emit(instruction(opcode, dispatch_width(), dst,
338 src0, src1, src2));
339 }
340 }
341
342 /**
343 * Create and insert an instruction with a variable number of sources
344 * into the program.
345 */
346 instruction *
347 emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
348 unsigned n) const
349 {
350 return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
351 }
352
353 /**
354 * Insert a preallocated instruction into the program.
355 */
356 instruction *
357 emit(instruction *inst) const
358 {
359 assert(inst->exec_size <= 32);
360 assert(inst->exec_size == dispatch_width() ||
361 force_writemask_all);
362
363 inst->group = _group;
364 inst->force_writemask_all = force_writemask_all;
365 inst->annotation = annotation.str;
366 inst->ir = annotation.ir;
367
368 if (block)
369 static_cast<instruction *>(cursor)->insert_before(block, inst);
370 else
371 cursor->insert_before(inst);
372
373 return inst;
374 }
375
376 /**
377 * Select \p src0 if the comparison of both sources with the given
378 * conditional mod evaluates to true, otherwise select \p src1.
379 *
380 * Generally useful to get the minimum or maximum of two values.
381 */
382 instruction *
383 emit_minmax(const dst_reg &dst, const src_reg &src0,
384 const src_reg &src1, brw_conditional_mod mod) const
385 {
386 assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
387
388 return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
389 fix_unsigned_negate(src1)));
390 }
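/* Hedged usage sketch: per-channel minimum and maximum of two values
 * (min_dst, max_dst, a and b are placeholder registers).
 *
 *    bld.emit_minmax(min_dst, a, b, BRW_CONDITIONAL_L);    // min(a, b)
 *    bld.emit_minmax(max_dst, a, b, BRW_CONDITIONAL_GE);   // max(a, b)
 */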
391
392 /**
393 * Copy any live channel from \p src to the first channel of the result.
394 */
395 src_reg
396 emit_uniformize(const src_reg &src) const
397 {
398 /* FIXME: We use a vector chan_index and dst to allow constant and
399 * copy propagation to move the result all the way into the consuming
400 * instruction (typically a surface index or sampler index for a
401 * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
402 * dispatch. Once we teach const/copy propagation about scalars we
403 * should go back to scalar destinations here.
404 */
405 const fs_builder ubld = exec_all();
406 const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
407 const dst_reg dst = vgrf(src.type);
408
409 ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)->flag_subreg = 2;
410 ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
411
412 return src_reg(component(dst, 0));
413 }
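/* Hedged usage sketch: making a potentially divergent surface index
 * uniform before it feeds a send, as the FIXME above alludes to
 * (surf_index is a placeholder register).
 *
 *    const src_reg surf = bld.emit_uniformize(surf_index);
 */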
414
415 void
416 emit_scan(enum opcode opcode, const dst_reg &tmp,
417 unsigned cluster_size, brw_conditional_mod mod) const
418 {
419 assert(dispatch_width() >= 8);
420
421 /* The instruction splitting code isn't advanced enough to split
422 * these so we need to handle that ourselves.
423 */
424 if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
425 const unsigned half_width = dispatch_width() / 2;
426 const fs_builder ubld = exec_all().group(half_width, 0);
427 dst_reg left = tmp;
428 dst_reg right = horiz_offset(tmp, half_width);
429 ubld.emit_scan(opcode, left, cluster_size, mod);
430 ubld.emit_scan(opcode, right, cluster_size, mod);
431 if (cluster_size > half_width) {
432 src_reg left_comp = component(left, half_width - 1);
433 set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
434 }
435 return;
436 }
437
438 if (cluster_size > 1) {
439 const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
440 dst_reg left = horiz_stride(tmp, 2);
441 dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
442
443 /* From the Cherryview PRM Vol. 7, "Register Region Restrictions":
444 *
445 * "When source or destination datatype is 64b or operation is
446 * integer DWord multiply, regioning in Align1 must follow
447 * these rules:
448 *
449 * [...]
450 *
451 * 3. Source and Destination offset must be the same, except
452 * the case of scalar source."
453 *
454 * In order to work around this, we create a temporary register
455 * and shift left over to match right. If we have a 64-bit type,
456 * we have to use two integer MOVs instead of a 64-bit MOV.
457 */
458 if (need_matching_subreg_offset(opcode, tmp.type)) {
459 dst_reg tmp2 = vgrf(tmp.type);
460 dst_reg new_left = horiz_stride(horiz_offset(tmp2, 1), 2);
461 if (type_sz(tmp.type) > 4) {
462 ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 0),
463 subscript(left, BRW_REGISTER_TYPE_D, 0));
464 ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 1),
465 subscript(left, BRW_REGISTER_TYPE_D, 1));
466 } else {
467 ubld.MOV(new_left, left);
468 }
469 left = new_left;
470 }
471 set_condmod(mod, ubld.emit(opcode, right, left, right));
472 }
473
474 if (cluster_size > 2) {
475 if (type_sz(tmp.type) <= 4 &&
476 !need_matching_subreg_offset(opcode, tmp.type)) {
477 const fs_builder ubld =
478 exec_all().group(dispatch_width() / 4, 0);
479 src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);
480
481 dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
482 set_condmod(mod, ubld.emit(opcode, right, left, right));
483
484 right = horiz_stride(horiz_offset(tmp, 3), 4);
485 set_condmod(mod, ubld.emit(opcode, right, left, right));
486 } else {
487 /* For 64-bit types, we have to do things differently because
488 * the code above would land us with destination strides that
489 * the hardware can't handle. Fortunately, we'll only be
490 * 8-wide in that case and it's the same number of
491 * instructions.
492 */
493 const fs_builder ubld = exec_all().group(2, 0);
494
495 for (unsigned i = 0; i < dispatch_width(); i += 4) {
496 src_reg left = component(tmp, i + 1);
497 dst_reg right = horiz_offset(tmp, i + 2);
498 set_condmod(mod, ubld.emit(opcode, right, left, right));
499 }
500 }
501 }
502
503 if (cluster_size > 4) {
504 const fs_builder ubld = exec_all().group(4, 0);
505 src_reg left = component(tmp, 3);
506 dst_reg right = horiz_offset(tmp, 4);
507 set_condmod(mod, ubld.emit(opcode, right, left, right));
508
509 if (dispatch_width() > 8) {
510 left = component(tmp, 8 + 3);
511 right = horiz_offset(tmp, 8 + 4);
512 set_condmod(mod, ubld.emit(opcode, right, left, right));
513 }
514 }
515
516 if (cluster_size > 8 && dispatch_width() > 8) {
517 const fs_builder ubld = exec_all().group(8, 0);
518 src_reg left = component(tmp, 7);
519 dst_reg right = horiz_offset(tmp, 8);
520 set_condmod(mod, ubld.emit(opcode, right, left, right));
521 }
522 }
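/* Illustrative note on how the steps above compose (an interpretation,
 * not normative): for a SIMD8 inclusive ADD scan with cluster_size == 8,
 * the first step folds each even channel into its odd neighbour, the
 * second adds channel 1 of each group of four into channels 2..3, and the
 * third adds channel 3 into channels 4..7, leaving every channel holding
 * the sum of channels 0..i within the cluster.
 */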
523
524 /**
525 * Assorted arithmetic ops.
526 * @{
527 */
528 #define ALU1(op) \
529 instruction * \
530 op(const dst_reg &dst, const src_reg &src0) const \
531 { \
532 return emit(BRW_OPCODE_##op, dst, src0); \
533 }
534
535 #define ALU2(op) \
536 instruction * \
537 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
538 { \
539 return emit(BRW_OPCODE_##op, dst, src0, src1); \
540 }
541
542 #define ALU2_ACC(op) \
543 instruction * \
544 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
545 { \
546 instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
547 inst->writes_accumulator = true; \
548 return inst; \
549 }
550
551 #define ALU3(op) \
552 instruction * \
553 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
554 const src_reg &src2) const \
555 { \
556 return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
557 }
558
559 ALU2(ADD)
560 ALU2_ACC(ADDC)
561 ALU2(AND)
562 ALU2(ASR)
563 ALU2(AVG)
564 ALU3(BFE)
565 ALU2(BFI1)
566 ALU3(BFI2)
567 ALU1(BFREV)
568 ALU1(CBIT)
569 ALU2(CMPN)
570 ALU1(DIM)
571 ALU2(DP2)
572 ALU2(DP3)
573 ALU2(DP4)
574 ALU2(DPH)
575 ALU1(F16TO32)
576 ALU1(F32TO16)
577 ALU1(FBH)
578 ALU1(FBL)
579 ALU1(FRC)
580 ALU2(LINE)
581 ALU1(LZD)
582 ALU2(MAC)
583 ALU2_ACC(MACH)
584 ALU3(MAD)
585 ALU1(MOV)
586 ALU2(MUL)
587 ALU1(NOT)
588 ALU2(OR)
589 ALU2(PLN)
590 ALU1(RNDD)
591 ALU1(RNDE)
592 ALU1(RNDU)
593 ALU1(RNDZ)
594 ALU2(SAD2)
595 ALU2_ACC(SADA2)
596 ALU2(SEL)
597 ALU2(SHL)
598 ALU2(SHR)
599 ALU2_ACC(SUBB)
600 ALU2(XOR)
601
602 #undef ALU3
603 #undef ALU2_ACC
604 #undef ALU2
605 #undef ALU1
606 /** @} */
607
608 /**
609 * CMP: Sets the low bit of the destination channels with the result
610 * of the comparison, while the upper bits are undefined, and updates
611 * the flag register with the packed 16 bits of the result.
612 */
613 instruction *
614 CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
615 brw_conditional_mod condition) const
616 {
617 /* Take the instruction:
618 *
619 * CMP null<d> src0<f> src1<f>
620 *
621 * Original gen4 does type conversion to the destination type
622 * before comparison, producing garbage results for floating
623 * point comparisons.
624 *
625 * The destination type doesn't matter on newer generations,
626 * so we set the type to match src0 so we can compact the
627 * instruction.
628 */
629 return set_condmod(condition,
630 emit(BRW_OPCODE_CMP, retype(dst, src0.type),
631 fix_unsigned_negate(src0),
632 fix_unsigned_negate(src1)));
633 }
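/* Hedged example: a compare followed by a predicated select, built only
 * from helpers in this file plus the common set_predicate() IR helper
 * (dst, a and b are placeholder registers).
 *
 *    bld.CMP(bld.null_reg_f(), a, b, BRW_CONDITIONAL_GE);
 *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, a, b));
 */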
634
635 /**
636 * Gen4 predicated IF.
637 */
638 instruction *
639 IF(brw_predicate predicate) const
640 {
641 return set_predicate(predicate, emit(BRW_OPCODE_IF));
642 }
643
644 /**
645 * CSEL: dst = src2 <op> 0.0f ? src0 : src1
646 */
647 instruction *
648 CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
649 const src_reg &src2, brw_conditional_mod condition) const
650 {
651 /* CSEL only operates on floats, so we can't do integer </<=/>=/>
652 * comparisons. Zero/non-zero (== and !=) comparisons almost work.
653 * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
654 */
655 assert(src2.type == BRW_REGISTER_TYPE_F);
656
657 return set_condmod(condition,
658 emit(BRW_OPCODE_CSEL,
659 retype(dst, BRW_REGISTER_TYPE_F),
660 retype(src0, BRW_REGISTER_TYPE_F),
661 retype(src1, BRW_REGISTER_TYPE_F),
662 src2));
663 }
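/* Hedged example: with BRW_CONDITIONAL_NZ the helper above computes
 * dst = (cond != 0.0f) ? a : b in a single instruction (dst, a, b and
 * cond are placeholder float registers).
 *
 *    bld.CSEL(dst, a, b, cond, BRW_CONDITIONAL_NZ);
 */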
664
665 /**
666 * Emit a linear interpolation instruction.
667 */
668 instruction *
669 LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
670 const src_reg &a) const
671 {
672 if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
673 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
674 * we need to reorder the operands.
675 */
676 return emit(BRW_OPCODE_LRP, dst, a, y, x);
677
678 } else {
679 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
680 const dst_reg y_times_a = vgrf(dst.type);
681 const dst_reg one_minus_a = vgrf(dst.type);
682 const dst_reg x_times_one_minus_a = vgrf(dst.type);
683
684 MUL(y_times_a, y, a);
685 ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
686 MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
687 return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
688 }
689 }
690
691 /**
692 * Collect a number of registers in a contiguous range of registers.
693 */
694 instruction *
695 LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
696 unsigned sources, unsigned header_size) const
697 {
698 instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
699 inst->header_size = header_size;
700 inst->size_written = header_size * REG_SIZE;
701 for (unsigned i = header_size; i < sources; i++) {
702 inst->size_written +=
703 ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
704 REG_SIZE);
705 }
706
707 return inst;
708 }
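/* Illustrative sketch (assumed): gathering one header register and two
 * per-channel data sources into a contiguous send payload (header, data0,
 * data1 and payload are placeholder registers).
 *
 *    const src_reg srcs[] = { header, data0, data1 };
 *    bld.LOAD_PAYLOAD(payload, srcs, 3, 1);   // header_size == 1
 */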
709
710 backend_shader *shader;
711
712 private:
713 /**
714 * Workaround for negation of UD registers. See comment in
715 * fs_generator::generate_code() for more details.
716 */
717 src_reg
718 fix_unsigned_negate(const src_reg &src) const
719 {
720 if (src.type == BRW_REGISTER_TYPE_UD &&
721 src.negate) {
722 dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
723 MOV(temp, src);
724 return src_reg(temp);
725 } else {
726 return src;
727 }
728 }
729
730 /**
731 * Workaround for source register modes not supported by the ternary
732 * instruction encoding.
733 */
734 src_reg
735 fix_3src_operand(const src_reg &src) const
736 {
737 if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
738 return src;
739 } else {
740 dst_reg expanded = vgrf(src.type);
741 MOV(expanded, src);
742 return expanded;
743 }
744 }
745
746 /**
747 * Workaround for source register modes not supported by the math
748 * instruction.
749 */
750 src_reg
751 fix_math_operand(const src_reg &src) const
752 {
753 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
754 * might be able to do better by doing execsize = 1 math and then
755 * expanding that result out, but we would need to be careful with
756 * masking.
757 *
758 * Gen6 hardware ignores source modifiers (negate and abs) on math
759 * instructions, so we also move to a temp to set those up.
760 *
761 * Gen7 relaxes most of the above restrictions, but still can't use IMM
762 * operands to math instructions.
763 */
764 if ((shader->devinfo->gen == 6 &&
765 (src.file == IMM || src.file == UNIFORM ||
766 src.abs || src.negate)) ||
767 (shader->devinfo->gen == 7 && src.file == IMM)) {
768 const dst_reg tmp = vgrf(src.type);
769 MOV(tmp, src);
770 return tmp;
771 } else {
772 return src;
773 }
774 }
775
776
777 /* From the Cherryview PRM Vol. 7, "Register Region Restrictions":
778 *
779 * "When source or destination datatype is 64b or operation is
780 * integer DWord multiply, regioning in Align1 must follow
781 * these rules:
782 *
783 * [...]
784 *
785 * 3. Source and Destination offset must be the same, except
786 * the case of scalar source."
787 *
788 * This helper just detects when we're in this case.
789 */
790 bool
791 need_matching_subreg_offset(enum opcode opcode,
792 enum brw_reg_type type) const
793 {
794 if (!shader->devinfo->is_cherryview &&
795 !gen_device_info_is_9lp(shader->devinfo))
796 return false;
797
798 if (type_sz(type) > 4)
799 return true;
800
801 if (opcode == BRW_OPCODE_MUL &&
802 !brw_reg_type_is_floating_point(type))
803 return true;
804
805 return false;
806 }
807
808 bblock_t *block;
809 exec_node *cursor;
810
811 unsigned _dispatch_width;
812 unsigned _group;
813 bool force_writemask_all;
814
815 /** Debug annotation info. */
816 struct {
817 const char *str;
818 const void *ir;
819 } annotation;
820 };
821 }
822
823 #endif