mesa/i965/i915/r200: eliminate gl_vertex_program
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_builder.h
1 /* -*- c++ -*- */
2 /*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #ifndef BRW_VEC4_BUILDER_H
26 #define BRW_VEC4_BUILDER_H
27
28 #include "brw_ir_vec4.h"
29 #include "brw_ir_allocator.h"
30 #include "brw_context.h"
31
32 namespace brw {
33 /**
34 * Toolbox to assemble a VEC4 IR program out of individual instructions.
35 *
36 * This object is meant to have an interface consistent with
37 * brw::fs_builder. They cannot be fully interchangeable because
38 * brw::fs_builder generates scalar code while brw::vec4_builder generates
39 * vector code.
40 */
41 class vec4_builder {
42 public:
43 /** Type used in this IR to represent a source of an instruction. */
44 typedef brw::src_reg src_reg;
45
46 /** Type used in this IR to represent the destination of an instruction. */
47 typedef brw::dst_reg dst_reg;
48
49 /** Type used in this IR to represent an instruction. */
50 typedef vec4_instruction instruction;
51
52 /**
53 * Construct a vec4_builder that inserts instructions into \p shader.
54 */
55 vec4_builder(backend_shader *shader) :
56 shader(shader), block(NULL), cursor(NULL),
57 force_writemask_all(false),
58 annotation()
59 {
60 }
61
62 /**
63 * Construct a vec4_builder that inserts instructions into \p shader
64 * before instruction \p inst in basic block \p block. The default
65 * execution controls and debug annotation are initialized from the
66 * instruction passed as argument.
67 */
68 vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
69 shader(shader), block(block), cursor(inst),
70 force_writemask_all(inst->force_writemask_all)
71 {
72 annotation.str = inst->annotation;
73 annotation.ir = inst->ir;
74 }
75
76 /**
77 * Construct a vec4_builder that inserts instructions before \p cursor
78 * in basic block \p block, inheriting other code generation parameters
79 * from this.
80 */
81 vec4_builder
82 at(bblock_t *block, exec_node *cursor) const
83 {
84 vec4_builder bld = *this;
85 bld.block = block;
86 bld.cursor = cursor;
87 return bld;
88 }
89
90 /**
91 * Construct a vec4_builder appending instructions at the end of the
92 * instruction list of the shader, inheriting other code generation
93 * parameters from this.
94 */
95 vec4_builder
96 at_end() const
97 {
98 return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
99 }
100
101 /**
102 * Construct a builder with per-channel control flow execution masking
103 * disabled if \p b is true. If control flow execution masking is
104 * already disabled this has no effect.
105 */
106 vec4_builder
107 exec_all(bool b = true) const
108 {
109 vec4_builder bld = *this;
110 if (b)
111 bld.force_writemask_all = true;
112 return bld;
113 }
114
115 /**
116 * Construct a builder with the given debug annotation info.
117 */
118 vec4_builder
119 annotate(const char *str, const void *ir = NULL) const
120 {
121 vec4_builder bld = *this;
122 bld.annotation.str = str;
123 bld.annotation.ir = ir;
124 return bld;
125 }
126
127 /**
128 * Get the SIMD width in use.
129 */
130 unsigned
131 dispatch_width() const
132 {
133 return 8;
134 }
135
136 /**
137 * Allocate a virtual register of natural vector size (four for this IR)
138 * and SIMD width. \p n gives the amount of space to allocate in
139 * dispatch_width units (which is just enough space for four logical
140 * components in this IR).
141 */
142 dst_reg
143 vgrf(enum brw_reg_type type, unsigned n = 1) const
144 {
145 assert(dispatch_width() <= 32);
146
147 if (n > 0)
148 return retype(dst_reg(VGRF, shader->alloc.allocate(
149 n * DIV_ROUND_UP(type_sz(type), 4))),
150 type);
151 else
152 return retype(null_reg_ud(), type);
153 }
154
155 /**
156 * Create a null register of floating type.
157 */
158 dst_reg
159 null_reg_f() const
160 {
161 return dst_reg(retype(brw_null_vec(dispatch_width()),
162 BRW_REGISTER_TYPE_F));
163 }
164
165 /**
166 * Create a null register of signed integer type.
167 */
168 dst_reg
169 null_reg_d() const
170 {
171 return dst_reg(retype(brw_null_vec(dispatch_width()),
172 BRW_REGISTER_TYPE_D));
173 }
174
175 /**
176 * Create a null register of unsigned integer type.
177 */
178 dst_reg
179 null_reg_ud() const
180 {
181 return dst_reg(retype(brw_null_vec(dispatch_width()),
182 BRW_REGISTER_TYPE_UD));
183 }
184
185 /**
186 * Insert an instruction into the program.
187 */
188 instruction *
189 emit(const instruction &inst) const
190 {
191 return emit(new(shader->mem_ctx) instruction(inst));
192 }
193
194 /**
195 * Create and insert a nullary control instruction into the program.
196 */
197 instruction *
198 emit(enum opcode opcode) const
199 {
200 return emit(instruction(opcode));
201 }
202
203 /**
204 * Create and insert a nullary instruction into the program.
205 */
206 instruction *
207 emit(enum opcode opcode, const dst_reg &dst) const
208 {
209 return emit(instruction(opcode, dst));
210 }
211
212 /**
213 * Create and insert a unary instruction into the program.
214 */
215 instruction *
216 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
217 {
218 switch (opcode) {
219 case SHADER_OPCODE_RCP:
220 case SHADER_OPCODE_RSQ:
221 case SHADER_OPCODE_SQRT:
222 case SHADER_OPCODE_EXP2:
223 case SHADER_OPCODE_LOG2:
224 case SHADER_OPCODE_SIN:
225 case SHADER_OPCODE_COS:
226 return fix_math_instruction(
227 emit(instruction(opcode, dst,
228 fix_math_operand(src0))));
229
230 default:
231 return emit(instruction(opcode, dst, src0));
232 }
233 }
234
235 /**
236 * Create and insert a binary instruction into the program.
237 */
238 instruction *
239 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
240 const src_reg &src1) const
241 {
242 switch (opcode) {
243 case SHADER_OPCODE_POW:
244 case SHADER_OPCODE_INT_QUOTIENT:
245 case SHADER_OPCODE_INT_REMAINDER:
246 return fix_math_instruction(
247 emit(instruction(opcode, dst,
248 fix_math_operand(src0),
249 fix_math_operand(src1))));
250
251 default:
252 return emit(instruction(opcode, dst, src0, src1));
253 }
254 }
255
256 /**
257 * Create and insert a ternary instruction into the program.
258 */
259 instruction *
260 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
261 const src_reg &src1, const src_reg &src2) const
262 {
263 switch (opcode) {
264 case BRW_OPCODE_BFE:
265 case BRW_OPCODE_BFI2:
266 case BRW_OPCODE_MAD:
267 case BRW_OPCODE_LRP:
268 return emit(instruction(opcode, dst,
269 fix_3src_operand(src0),
270 fix_3src_operand(src1),
271 fix_3src_operand(src2)));
272
273 default:
274 return emit(instruction(opcode, dst, src0, src1, src2));
275 }
276 }
277
278 /**
279 * Insert a preallocated instruction into the program.
280 */
281 instruction *
282 emit(instruction *inst) const
283 {
284 inst->force_writemask_all = force_writemask_all;
285 inst->annotation = annotation.str;
286 inst->ir = annotation.ir;
287
288 if (block)
289 static_cast<instruction *>(cursor)->insert_before(block, inst);
290 else
291 cursor->insert_before(inst);
292
293 return inst;
294 }
295
296 /**
297 * Select \p src0 if the comparison of both sources with the given
298 * conditional mod evaluates to true, otherwise select \p src1.
299 *
300 * Generally useful to get the minimum or maximum of two values.
301 */
302 instruction *
303 emit_minmax(const dst_reg &dst, const src_reg &src0,
304 const src_reg &src1, brw_conditional_mod mod) const
305 {
306 assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
307
308 return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
309 fix_unsigned_negate(src1)));
310 }
311
312 /**
313 * Copy any live channel from \p src to the first channel of the result.
314 */
315 src_reg
316 emit_uniformize(const src_reg &src) const
317 {
318 const vec4_builder ubld = exec_all();
319 const dst_reg chan_index =
320 writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
321 const dst_reg dst = vgrf(src.type);
322
323 ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
324 ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));
325
326 return src_reg(dst);
327 }
328
329 /**
330 * Assorted arithmetic ops.
331 * @{
332 */
333 #define ALU1(op) \
334 instruction * \
335 op(const dst_reg &dst, const src_reg &src0) const \
336 { \
337 return emit(BRW_OPCODE_##op, dst, src0); \
338 }
339
340 #define ALU2(op) \
341 instruction * \
342 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
343 { \
344 return emit(BRW_OPCODE_##op, dst, src0, src1); \
345 }
346
347 #define ALU2_ACC(op) \
348 instruction * \
349 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
350 { \
351 instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
352 inst->writes_accumulator = true; \
353 return inst; \
354 }
355
356 #define ALU3(op) \
357 instruction * \
358 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
359 const src_reg &src2) const \
360 { \
361 return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
362 }
363
364 ALU2(ADD)
365 ALU2_ACC(ADDC)
366 ALU2(AND)
367 ALU2(ASR)
368 ALU2(AVG)
369 ALU3(BFE)
370 ALU2(BFI1)
371 ALU3(BFI2)
372 ALU1(BFREV)
373 ALU1(CBIT)
374 ALU2(CMPN)
375 ALU3(CSEL)
376 ALU1(DIM)
377 ALU2(DP2)
378 ALU2(DP3)
379 ALU2(DP4)
380 ALU2(DPH)
381 ALU1(F16TO32)
382 ALU1(F32TO16)
383 ALU1(FBH)
384 ALU1(FBL)
385 ALU1(FRC)
386 ALU2(LINE)
387 ALU1(LZD)
388 ALU2(MAC)
389 ALU2_ACC(MACH)
390 ALU3(MAD)
391 ALU1(MOV)
392 ALU2(MUL)
393 ALU1(NOT)
394 ALU2(OR)
395 ALU2(PLN)
396 ALU1(RNDD)
397 ALU1(RNDE)
398 ALU1(RNDU)
399 ALU1(RNDZ)
400 ALU2(SAD2)
401 ALU2_ACC(SADA2)
402 ALU2(SEL)
403 ALU2(SHL)
404 ALU2(SHR)
405 ALU2_ACC(SUBB)
406 ALU2(XOR)
407
408 #undef ALU3
409 #undef ALU2_ACC
410 #undef ALU2
411 #undef ALU1
412 /** @} */
413
414 /**
415 * CMP: Sets the low bit of the destination channels with the result
416 * of the comparison, while the upper bits are undefined, and updates
417 * the flag register with the packed 16 bits of the result.
418 */
419 instruction *
420 CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
421 brw_conditional_mod condition) const
422 {
423 /* Take the instruction:
424 *
425 * CMP null<d> src0<f> src1<f>
426 *
427 * Original gen4 does type conversion to the destination type
428 * before comparison, producing garbage results for floating
429 * point comparisons.
430 *
431 * The destination type doesn't matter on newer generations,
432 * so we set the type to match src0 so we can compact the
433 * instruction.
434 */
435 return set_condmod(condition,
436 emit(BRW_OPCODE_CMP, retype(dst, src0.type),
437 fix_unsigned_negate(src0),
438 fix_unsigned_negate(src1)));
439 }
440
441 /**
442 * Gen4 predicated IF.
443 */
444 instruction *
445 IF(brw_predicate predicate) const
446 {
447 return set_predicate(predicate, emit(BRW_OPCODE_IF));
448 }
449
450 /**
451 * Gen6 IF with embedded comparison.
452 */
453 instruction *
454 IF(const src_reg &src0, const src_reg &src1,
455 brw_conditional_mod condition) const
456 {
457 assert(shader->devinfo->gen == 6);
458 return set_condmod(condition,
459 emit(BRW_OPCODE_IF,
460 null_reg_d(),
461 fix_unsigned_negate(src0),
462 fix_unsigned_negate(src1)));
463 }
464
465 /**
466 * Emit a linear interpolation instruction.
467 */
468 instruction *
469 LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
470 const src_reg &a) const
471 {
472 if (shader->devinfo->gen >= 6) {
473 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
474 * we need to reorder the operands.
475 */
476 return emit(BRW_OPCODE_LRP, dst, a, y, x);
477
478 } else {
479 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
480 const dst_reg y_times_a = vgrf(dst.type);
481 const dst_reg one_minus_a = vgrf(dst.type);
482 const dst_reg x_times_one_minus_a = vgrf(dst.type);
483
484 MUL(y_times_a, y, a);
485 ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
486 MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
487 return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
488 }
489 }
490
491 backend_shader *shader;
492
493 protected:
494 /**
495 * Workaround for negation of UD registers. See comment in
496 * fs_generator::generate_code() for the details.
497 */
498 src_reg
499 fix_unsigned_negate(const src_reg &src) const
500 {
501 if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
502 dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
503 MOV(temp, src);
504 return src_reg(temp);
505 } else {
506 return src;
507 }
508 }
509
510 /**
511 * Workaround for register access modes not supported by the ternary
512 * instruction encoding.
513 */
514 src_reg
515 fix_3src_operand(const src_reg &src) const
516 {
517 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
518 * able to use vertical stride of zero to replicate the vec4 uniform, like
519 *
520 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
521 *
522 * But you can't, since vertical stride is always four in three-source
523 * instructions. Instead, insert a MOV instruction to do the replication so
524 * that the three-source instruction can consume it.
525 */
526
527 /* The MOV is only needed if the source is a uniform or immediate. */
528 if (src.file != UNIFORM && src.file != IMM)
529 return src;
530
531 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
532 return src;
533
534 const dst_reg expanded = vgrf(src.type);
535 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
536 return src_reg(expanded);
537 }
538
539 /**
540 * Workaround for register access modes not supported by the math
541 * instruction.
542 */
543 src_reg
544 fix_math_operand(const src_reg &src) const
545 {
546 /* The gen6 math instruction ignores the source modifiers --
547 * swizzle, abs, negate, and at least some parts of the register
548 * region description.
549 *
550 * Rather than trying to enumerate all these cases, *always* expand the
551 * operand to a temp GRF for gen6.
552 *
553 * For gen7, keep the operand as-is, except if immediate, which gen7 still
554 * can't use.
555 */
556 if (shader->devinfo->gen == 6 ||
557 (shader->devinfo->gen == 7 && src.file == IMM)) {
558 const dst_reg tmp = vgrf(src.type);
559 MOV(tmp, src);
560 return src_reg(tmp);
561 } else {
562 return src;
563 }
564 }
565
566 /**
567 * Workaround other weirdness of the math instruction.
568 */
569 instruction *
570 fix_math_instruction(instruction *inst) const
571 {
572 if (shader->devinfo->gen == 6 &&
573 inst->dst.writemask != WRITEMASK_XYZW) {
574 const dst_reg tmp = vgrf(inst->dst.type);
575 MOV(inst->dst, src_reg(tmp));
576 inst->dst = tmp;
577
578 } else if (shader->devinfo->gen < 6) {
579 const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
580 inst->base_mrf = 1;
581 inst->mlen = sources;
582 }
583
584 return inst;
585 }
586
587 bblock_t *block;
588 exec_node *cursor;
589
590 bool force_writemask_all;
591
592 /** Debug annotation info. */
593 struct {
594 const char *str;
595 const void *ir;
596 } annotation;
597 };
598 }
599
600 #endif