i965: Move the back-end compiler to src/intel/compiler
[mesa.git] / src / intel / compiler / brw_vec4_builder.h
1 /* -*- c++ -*- */
2 /*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #ifndef BRW_VEC4_BUILDER_H
26 #define BRW_VEC4_BUILDER_H
27
28 #include "brw_ir_vec4.h"
29 #include "brw_ir_allocator.h"
30
31 namespace brw {
32 /**
33 * Toolbox to assemble a VEC4 IR program out of individual instructions.
34 *
35 * This object is meant to have an interface consistent with
36 * brw::fs_builder. They cannot be fully interchangeable because
37 * brw::fs_builder generates scalar code while brw::vec4_builder generates
38 * vector code.
39 */
class vec4_builder {
public:
   /** Type used in this IR to represent a source of an instruction. */
   typedef brw::src_reg src_reg;

   /** Type used in this IR to represent the destination of an instruction. */
   typedef brw::dst_reg dst_reg;

   /** Type used in this IR to represent an instruction. */
   typedef vec4_instruction instruction;

   /**
    * Construct a vec4_builder that inserts instructions into \p shader.
    */
   vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
      shader(shader), block(NULL), cursor(NULL),
      _dispatch_width(dispatch_width), _group(0),
      force_writemask_all(false),
      annotation()
   {
   }

   /**
    * Construct a vec4_builder that inserts instructions into \p shader
    * before instruction \p inst in basic block \p block.  The default
    * execution controls and debug annotation are initialized from the
    * instruction passed as argument.
    */
   vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
      shader(shader), block(block), cursor(inst),
      _dispatch_width(inst->exec_size), _group(inst->group),
      force_writemask_all(inst->force_writemask_all)
   {
      annotation.str = inst->annotation;
      annotation.ir = inst->ir;
   }

   /**
    * Construct a vec4_builder that inserts instructions before \p cursor
    * in basic block \p block, inheriting other code generation parameters
    * from this.
    */
   vec4_builder
   at(bblock_t *block, exec_node *cursor) const
   {
      vec4_builder bld = *this;
      bld.block = block;
      bld.cursor = cursor;
      return bld;
   }

   /**
    * Construct a vec4_builder appending instructions at the end of the
    * instruction list of the shader, inheriting other code generation
    * parameters from this.
    */
   vec4_builder
   at_end() const
   {
      /* Inserting before the list's tail sentinel is equivalent to
       * appending at the end of the program.  No basic block is
       * specified, so emit() falls back to raw list insertion.
       */
      return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
   }

   /**
    * Construct a builder specifying the default SIMD width and group of
    * channel enable signals, inheriting other code generation parameters
    * from this.
    *
    * \p n gives the default SIMD width, \p i gives the slot group used for
    * predication and control flow masking in multiples of \p n channels.
    */
   vec4_builder
   group(unsigned n, unsigned i) const
   {
      /* The requested group of n channels at slot i must lie within the
       * current dispatch width, unless execution masking is disabled
       * altogether.
       */
      assert(force_writemask_all ||
             (n <= dispatch_width() && i < dispatch_width() / n));
      vec4_builder bld = *this;
      bld._dispatch_width = n;
      bld._group += i * n;
      return bld;
   }

   /**
    * Construct a builder with per-channel control flow execution masking
    * disabled if \p b is true.  If control flow execution masking is
    * already disabled this has no effect.
    */
   vec4_builder
   exec_all(bool b = true) const
   {
      vec4_builder bld = *this;
      if (b)
         bld.force_writemask_all = true;
      return bld;
   }

   /**
    * Construct a builder with the given debug annotation info.
    */
   vec4_builder
   annotate(const char *str, const void *ir = NULL) const
   {
      vec4_builder bld = *this;
      bld.annotation.str = str;
      bld.annotation.ir = ir;
      return bld;
   }

   /**
    * Get the SIMD width in use.
    */
   unsigned
   dispatch_width() const
   {
      return _dispatch_width;
   }

   /**
    * Get the channel group in use.
    */
   unsigned
   group() const
   {
      return _group;
   }

   /**
    * Allocate a virtual register of natural vector size (four for this IR)
    * and SIMD width.  \p n gives the amount of space to allocate in
    * dispatch_width units (which is just enough space for four logical
    * components in this IR).
    */
   dst_reg
   vgrf(enum brw_reg_type type, unsigned n = 1) const
   {
      assert(dispatch_width() <= 32);

      if (n > 0)
         /* Scale the allocation by the type size in 32-bit units so that
          * types wider than 32 bits get proportionally more register
          * space per logical component.
          */
         return retype(dst_reg(VGRF, shader->alloc.allocate(
                                  n * DIV_ROUND_UP(type_sz(type), 4))),
                       type);
      else
         /* A zero-sized allocation degenerates to the null register. */
         return retype(null_reg_ud(), type);
   }

   /**
    * Create a null register of floating type.
    */
   dst_reg
   null_reg_f() const
   {
      return dst_reg(retype(brw_null_vec(dispatch_width()),
                            BRW_REGISTER_TYPE_F));
   }

   /**
    * Create a null register of signed integer type.
    */
   dst_reg
   null_reg_d() const
   {
      return dst_reg(retype(brw_null_vec(dispatch_width()),
                            BRW_REGISTER_TYPE_D));
   }

   /**
    * Create a null register of unsigned integer type.
    */
   dst_reg
   null_reg_ud() const
   {
      return dst_reg(retype(brw_null_vec(dispatch_width()),
                            BRW_REGISTER_TYPE_UD));
   }

   /**
    * Insert an instruction into the program.
    */
   instruction *
   emit(const instruction &inst) const
   {
      /* Heap-allocate a copy out of the shader's memory context and defer
       * to the pointer overload below.
       */
      return emit(new(shader->mem_ctx) instruction(inst));
   }

   /**
    * Create and insert a nullary control instruction into the program.
    */
   instruction *
   emit(enum opcode opcode) const
   {
      return emit(instruction(opcode));
   }

   /**
    * Create and insert a nullary instruction into the program.
    */
   instruction *
   emit(enum opcode opcode, const dst_reg &dst) const
   {
      return emit(instruction(opcode, dst));
   }

   /**
    * Create and insert a unary instruction into the program.
    */
   instruction *
   emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
   {
      switch (opcode) {
      /* Math opcodes get their operand and instruction legalized for the
       * hardware math unit, see fix_math_operand() and
       * fix_math_instruction() below.
       */
      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         return fix_math_instruction(
            emit(instruction(opcode, dst,
                             fix_math_operand(src0))));

      default:
         return emit(instruction(opcode, dst, src0));
      }
   }

   /**
    * Create and insert a binary instruction into the program.
    */
   instruction *
   emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
        const src_reg &src1) const
   {
      switch (opcode) {
      /* Two-source math opcodes need the same legalization as the unary
       * ones above, applied to both operands.
       */
      case SHADER_OPCODE_POW:
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         return fix_math_instruction(
            emit(instruction(opcode, dst,
                             fix_math_operand(src0),
                             fix_math_operand(src1))));

      default:
         return emit(instruction(opcode, dst, src0, src1));
      }
   }

   /**
    * Create and insert a ternary instruction into the program.
    */
   instruction *
   emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
        const src_reg &src1, const src_reg &src2) const
   {
      switch (opcode) {
      /* Three-source instructions have a restricted encoding, legalize
       * each operand through fix_3src_operand() below.
       */
      case BRW_OPCODE_BFE:
      case BRW_OPCODE_BFI2:
      case BRW_OPCODE_MAD:
      case BRW_OPCODE_LRP:
         return emit(instruction(opcode, dst,
                                 fix_3src_operand(src0),
                                 fix_3src_operand(src1),
                                 fix_3src_operand(src2)));

      default:
         return emit(instruction(opcode, dst, src0, src1, src2));
      }
   }

   /**
    * Insert a preallocated instruction into the program.
    */
   instruction *
   emit(instruction *inst) const
   {
      /* Stamp the builder's current execution controls and debug
       * annotation onto the instruction before inserting it.
       */
      inst->exec_size = dispatch_width();
      inst->group = group();
      inst->force_writemask_all = force_writemask_all;
      inst->size_written = inst->exec_size * type_sz(inst->dst.type);
      inst->annotation = annotation.str;
      inst->ir = annotation.ir;

      /* Use the block-aware insertion overload when the basic block is
       * known, otherwise insert into the raw instruction list.
       */
      if (block)
         static_cast<instruction *>(cursor)->insert_before(block, inst);
      else
         cursor->insert_before(inst);

      return inst;
   }

   /**
    * Select \p src0 if the comparison of both sources with the given
    * conditional mod evaluates to true, otherwise select \p src1.
    *
    * Generally useful to get the minimum or maximum of two values.
    */
   instruction *
   emit_minmax(const dst_reg &dst, const src_reg &src0,
               const src_reg &src1, brw_conditional_mod mod) const
   {
      /* Only GE (maximum) and L (minimum) make sense for a SEL-based
       * min/max.
       */
      assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

      return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                  fix_unsigned_negate(src1)));
   }

   /**
    * Copy any live channel from \p src to the first channel of the result.
    */
   src_reg
   emit_uniformize(const src_reg &src) const
   {
      /* Both instructions run with execution masking disabled so they
       * execute even in channels that are currently dead.
       */
      const vec4_builder ubld = exec_all();
      const dst_reg chan_index =
         writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
      const dst_reg dst = vgrf(src.type);

      ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
      ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));

      return src_reg(dst);
   }

   /**
    * Assorted arithmetic ops.
    * @{
    */
#define ALU1(op)                                        \
   instruction *                                        \
   op(const dst_reg &dst, const src_reg &src0) const    \
   {                                                    \
      return emit(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
   {                                                                    \
      return emit(BRW_OPCODE_##op, dst, src0, src1);                    \
   }

#define ALU2_ACC(op)                                                    \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
   {                                                                    \
      instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);       \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,     \
      const src_reg &src2) const                                        \
   {                                                                    \
      return emit(BRW_OPCODE_##op, dst, src0, src1, src2);              \
   }

   ALU2(ADD)
   ALU2_ACC(ADDC)
   ALU2(AND)
   ALU2(ASR)
   ALU2(AVG)
   ALU3(BFE)
   ALU2(BFI1)
   ALU3(BFI2)
   ALU1(BFREV)
   ALU1(CBIT)
   ALU2(CMPN)
   ALU3(CSEL)
   ALU1(DIM)
   ALU2(DP2)
   ALU2(DP3)
   ALU2(DP4)
   ALU2(DPH)
   ALU1(F16TO32)
   ALU1(F32TO16)
   ALU1(FBH)
   ALU1(FBL)
   ALU1(FRC)
   ALU2(LINE)
   ALU1(LZD)
   ALU2(MAC)
   ALU2_ACC(MACH)
   ALU3(MAD)
   ALU1(MOV)
   ALU2(MUL)
   ALU1(NOT)
   ALU2(OR)
   ALU2(PLN)
   ALU1(RNDD)
   ALU1(RNDE)
   ALU1(RNDU)
   ALU1(RNDZ)
   ALU2(SAD2)
   ALU2_ACC(SADA2)
   ALU2(SEL)
   ALU2(SHL)
   ALU2(SHR)
   ALU2_ACC(SUBB)
   ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
   /** @} */

   /**
    * CMP: Sets the low bit of the destination channels with the result
    * of the comparison, while the upper bits are undefined, and updates
    * the flag register with the packed 16 bits of the result.
    */
   instruction *
   CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
       brw_conditional_mod condition) const
   {
      /* Take the instruction:
       *
       * CMP null<d> src0<f> src1<f>
       *
       * Original gen4 does type conversion to the destination type
       * before comparison, producing garbage results for floating
       * point comparisons.
       *
       * The destination type doesn't matter on newer generations,
       * so we set the type to match src0 so we can compact the
       * instruction.
       */
      return set_condmod(condition,
                         emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                              fix_unsigned_negate(src0),
                              fix_unsigned_negate(src1)));
   }

   /**
    * Gen4 predicated IF.
    */
   instruction *
   IF(brw_predicate predicate) const
   {
      return set_predicate(predicate, emit(BRW_OPCODE_IF));
   }

   /**
    * Gen6 IF with embedded comparison.
    */
   instruction *
   IF(const src_reg &src0, const src_reg &src1,
      brw_conditional_mod condition) const
   {
      /* The embedded-comparison form of IF only exists on gen6. */
      assert(shader->devinfo->gen == 6);
      return set_condmod(condition,
                         emit(BRW_OPCODE_IF,
                              null_reg_d(),
                              fix_unsigned_negate(src0),
                              fix_unsigned_negate(src1)));
   }

   /**
    * Emit a linear interpolation instruction.
    */
   instruction *
   LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
       const src_reg &a) const
   {
      if (shader->devinfo->gen >= 6) {
         /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
          * we need to reorder the operands.
          */
         return emit(BRW_OPCODE_LRP, dst, a, y, x);

      } else {
         /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
         const dst_reg y_times_a = vgrf(dst.type);
         const dst_reg one_minus_a = vgrf(dst.type);
         const dst_reg x_times_one_minus_a = vgrf(dst.type);

         MUL(y_times_a, y, a);
         ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
         MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
         return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
      }
   }

   /** Shader program being constructed. */
   backend_shader *shader;

protected:
   /**
    * Workaround for negation of UD registers.  See comment in
    * fs_generator::generate_code() for the details.
    */
   src_reg
   fix_unsigned_negate(const src_reg &src) const
   {
      if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
         /* Resolve the negation through an explicit MOV into a
          * temporary, and hand back the unmodified temporary.
          */
         dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
         MOV(temp, src);
         return src_reg(temp);
      } else {
         return src;
      }
   }

   /**
    * Workaround for register access modes not supported by the ternary
    * instruction encoding.
    */
   src_reg
   fix_3src_operand(const src_reg &src) const
   {
      /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
       * able to use vertical stride of zero to replicate the vec4 uniform, like
       *
       *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
       *
       * But you can't, since vertical stride is always four in three-source
       * instructions. Instead, insert a MOV instruction to do the replication so
       * that the three-source instruction can consume it.
       */

      /* The MOV is only needed if the source is a uniform or immediate. */
      if (src.file != UNIFORM && src.file != IMM)
         return src;

      if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
         return src;

      const dst_reg expanded = vgrf(src.type);
      emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
      return src_reg(expanded);
   }

   /**
    * Workaround for register access modes not supported by the math
    * instruction.
    */
   src_reg
   fix_math_operand(const src_reg &src) const
   {
      /* The gen6 math instruction ignores the source modifiers --
       * swizzle, abs, negate, and at least some parts of the register
       * region description.
       *
       * Rather than trying to enumerate all these cases, *always* expand the
       * operand to a temp GRF for gen6.
       *
       * For gen7, keep the operand as-is, except if immediate, which gen7 still
       * can't use.
       */
      if (shader->devinfo->gen == 6 ||
          (shader->devinfo->gen == 7 && src.file == IMM)) {
         const dst_reg tmp = vgrf(src.type);
         MOV(tmp, src);
         return src_reg(tmp);
      } else {
         return src;
      }
   }

   /**
    * Workaround other weirdness of the math instruction.
    */
   instruction *
   fix_math_instruction(instruction *inst) const
   {
      if (shader->devinfo->gen == 6 &&
          inst->dst.writemask != WRITEMASK_XYZW) {
         /* \p inst has already been inserted before the cursor, so the
          * MOV emitted here ends up right after it.  The MOV is created
          * while inst->dst still holds the original (partially
          * writemasked) destination; \p inst is then retargeted to write
          * the whole temporary, which the MOV copies back into the
          * original destination under its writemask.
          */
         const dst_reg tmp = vgrf(inst->dst.type);
         MOV(inst->dst, src_reg(tmp));
         inst->dst = tmp;

      } else if (shader->devinfo->gen < 6) {
         /* Pre-gen6 math reads its arguments as a message payload --
          * point the instruction at the MRF space and give the payload
          * length in number of sources.
          */
         const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
         inst->base_mrf = 1;
         inst->mlen = sources;
      }

      return inst;
   }

   /** Basic block new instructions are inserted into, may be NULL. */
   bblock_t *block;
   /** Insertion point: new instructions are inserted before this node. */
   exec_node *cursor;

   /** Default SIMD width of emitted instructions. */
   unsigned _dispatch_width;
   /** Default channel enable group of emitted instructions. */
   unsigned _group;
   /** Whether per-channel execution masking is disabled by default. */
   bool force_writemask_all;

   /** Debug annotation info. */
   struct {
      const char *str;
      const void *ir;
   } annotation;
};
632 }
633
634 #endif