i965/nir/vec4: Implement load_const intrinsic
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_vp.cpp
1 /*
2 * Copyright © 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_vec4_vp.cpp
25 *
26 * A translator from Mesa IR to the i965 driver's Vec4 IR, used to implement
27 * ARB_vertex_program and fixed-function vertex processing.
28 */
29
30 #include "brw_context.h"
31 #include "brw_vec4.h"
32 #include "brw_vs.h"
33 extern "C" {
34 #include "program/prog_parameter.h"
35 #include "program/prog_print.h"
36 }
37 using namespace brw;
38
39 void
40 vec4_visitor::emit_vp_sop(enum brw_conditional_mod conditional_mod,
41 dst_reg dst, src_reg src0, src_reg src1,
42 src_reg one)
43 {
44 vec4_instruction *inst;
45
46 inst = emit(CMP(dst_null_f(), src0, src1, conditional_mod));
47
48 inst = emit(BRW_OPCODE_SEL, dst, one, src_reg(0.0f));
49 inst->predicate = BRW_PREDICATE_NORMAL;
50 }
51
/**
 * Translate the Mesa IR vertex program (prog->Instructions) into Vec4 IR,
 * one prog_instruction at a time.
 *
 * Every instruction writes into a freshly-allocated temporary and is then
 * copied to the real destination, which sidesteps source/destination
 * aliasing hazards at the cost of extra MOVs (expected to be cleaned up by
 * later copy propagation — NOTE(review): not visible from this file; verify).
 */
void
vec4_vs_visitor::emit_program_code()
{
   this->need_all_constants_in_pull_buffer = false;

   setup_vp_regs();

   /* Keep a reg with 1.0 around, for reuse by emit_vs_sop so that it can just
    * be:
    *
    * sel.f0 dst 1.0 0.0
    *
    * instead of
    *
    * mov dst 0.0
    * mov.f0 dst 1.0
    */
   src_reg one = src_reg(this, glsl_type::float_type);
   emit(MOV(dst_reg(one), src_reg(1.0f)));

   for (unsigned int insn = 0; insn < prog->NumInstructions; insn++) {
      const struct prog_instruction *vpi = &prog->Instructions[insn];
      base_ir = vpi;

      dst_reg dst;
      src_reg src[3];

      /* We always emit into a temporary destination register to avoid
       * aliasing issues.
       */
      dst = dst_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 3; i++)
         src[i] = get_vp_src_reg(vpi->SrcReg[i]);

      switch (vpi->Opcode) {
      case OPCODE_ABS:
         /* abs overrides any negate already on the source modifier. */
         src[0].abs = true;
         src[0].negate = false;
         emit(MOV(dst, src[0]));
         break;

      case OPCODE_ADD:
         emit(ADD(dst, src[0], src[1]));
         break;

      case OPCODE_ARL:
         if (devinfo->gen >= 6) {
            /* Gen6+: round down in float, then convert to the integer
             * address register type with a typed MOV.
             */
            dst.writemask = WRITEMASK_X;
            dst_reg dst_f = dst;
            dst_f.type = BRW_REGISTER_TYPE_F;

            emit(RNDD(dst_f, src[0]));
            emit(MOV(dst, src_reg(dst_f)));
         } else {
            emit(RNDD(dst, src[0]));
         }
         break;

      case OPCODE_DP3:
         emit(DP3(dst, src[0], src[1]));
         break;
      case OPCODE_DP4:
         emit(DP4(dst, src[0], src[1]));
         break;
      case OPCODE_DPH:
         emit(DPH(dst, src[0], src[1]));
         break;

      case OPCODE_DST: {
         /* DST (distance vector): result = (1, x0*x1, z0, w1).
          * Each channel is emitted separately, gated on the writemask.
          */
         dst_reg t = dst;
         if (vpi->DstReg.WriteMask & WRITEMASK_X) {
            t.writemask = WRITEMASK_X;
            emit(MOV(t, src_reg(1.0f)));
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_Y) {
            t.writemask = WRITEMASK_Y;
            emit(MUL(t, src[0], src[1]));
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_Z) {
            t.writemask = WRITEMASK_Z;
            emit(MOV(t, src[0]));
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_W) {
            t.writemask = WRITEMASK_W;
            emit(MOV(t, src[1]));
         }
         break;
      }

      case OPCODE_EXP: {
         /* EXP: result = (2^floor(x), frac(x), 2^x, 1). The .x channel is
          * built by hand-assembling an IEEE float from the integer exponent.
          */
         dst_reg result = dst;
         if (vpi->DstReg.WriteMask & WRITEMASK_X) {
            /* tmp_d = floor(src[0].x) */
            src_reg tmp_d = src_reg(this, glsl_type::ivec4_type);
            assert(tmp_d.type == BRW_REGISTER_TYPE_D);
            emit(RNDD(dst_reg(tmp_d), swizzle(src[0], BRW_SWIZZLE_XXXX)));

            /* result[0] = 2.0 ^ tmp */
            /* Adjust exponent for floating point: exp += 127 */
            dst_reg tmp_d_x(GRF, tmp_d.reg, glsl_type::int_type, WRITEMASK_X);
            emit(ADD(tmp_d_x, tmp_d, src_reg(127)));

            /* Install exponent and sign. Excess drops off the edge: */
            dst_reg res_d_x(GRF, result.reg, glsl_type::int_type, WRITEMASK_X);
            emit(BRW_OPCODE_SHL, res_d_x, tmp_d, src_reg(23));
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_Y) {
            result.writemask = WRITEMASK_Y;
            emit(FRC(result, src[0]));
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_Z) {
            result.writemask = WRITEMASK_Z;
            emit_math(SHADER_OPCODE_EXP2, result, src[0]);
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_W) {
            result.writemask = WRITEMASK_W;
            emit(MOV(result, src_reg(1.0f)));
         }
         break;
      }

      case OPCODE_EX2:
         emit_math(SHADER_OPCODE_EXP2, dst, src[0]);
         break;

      case OPCODE_FLR:
         emit(RNDD(dst, src[0]));
         break;

      case OPCODE_FRC:
         emit(FRC(dst, src[0]));
         break;

      case OPCODE_LG2:
         emit_math(SHADER_OPCODE_LOG2, dst, src[0]);
         break;

      case OPCODE_LIT: {
         dst_reg result = dst;
         /* From the ARB_vertex_program spec:
          *
          * tmp = VectorLoad(op0);
          * if (tmp.x < 0) tmp.x = 0;
          * if (tmp.y < 0) tmp.y = 0;
          * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
          * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
          * result.x = 1.0;
          * result.y = tmp.x;
          * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
          * result.w = 1.0;
          *
          * Note that we don't do the clamping to +/- 128. We didn't in
          * brw_vs_emit.c either.
          */
         if (vpi->DstReg.WriteMask & WRITEMASK_XW) {
            result.writemask = WRITEMASK_XW;
            emit(MOV(result, src_reg(1.0f)));
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_YZ) {
            /* Default y/z to 0; overwritten below only when tmp.x > 0. */
            result.writemask = WRITEMASK_YZ;
            emit(MOV(result, src_reg(0.0f)));

            src_reg tmp_x = swizzle(src[0], BRW_SWIZZLE_XXXX);

            emit(CMP(dst_null_d(), tmp_x, src_reg(0.0f), BRW_CONDITIONAL_G));
            emit(IF(BRW_PREDICATE_NORMAL));

            if (vpi->DstReg.WriteMask & WRITEMASK_Y) {
               result.writemask = WRITEMASK_Y;
               emit(MOV(result, tmp_x));
            }

            if (vpi->DstReg.WriteMask & WRITEMASK_Z) {
               /* if (tmp.y < 0) tmp.y = 0; */
               src_reg tmp_y = swizzle(src[0], BRW_SWIZZLE_YYYY);
               result.writemask = WRITEMASK_Z;
               emit_minmax(BRW_CONDITIONAL_GE, result, tmp_y, src_reg(0.0f));

               src_reg clamped_y(result);
               clamped_y.swizzle = BRW_SWIZZLE_ZZZZ;

               src_reg tmp_w = swizzle(src[0], BRW_SWIZZLE_WWWW);

               emit_math(SHADER_OPCODE_POW, result, clamped_y, tmp_w);
            }
            emit(BRW_OPCODE_ENDIF);
         }
         break;
      }

      case OPCODE_LOG: {
         /* LOG: result = (floor(log2|x|), mantissa, log2|x|, 1), built by
          * slicing the IEEE float bit pattern with integer ops.
          */
         dst_reg result = dst;
         result.type = BRW_REGISTER_TYPE_UD;
         src_reg result_src = src_reg(result);

         src_reg arg0_ud = swizzle(src[0], BRW_SWIZZLE_XXXX);
         arg0_ud.type = BRW_REGISTER_TYPE_UD;

         /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
          * according to spec:
          *
          * These almost look like they could be joined up, but not really
          * practical:
          *
          * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
          * result[1].i = (x.i & ((1<<23)-1)) | (127<<23)
          */
         if (vpi->DstReg.WriteMask & WRITEMASK_XZ) {
            /* Extract the biased exponent and subtract the bias. */
            result.writemask = WRITEMASK_X;
            emit(AND(result, arg0_ud, src_reg((1u << 31) - 1)));
            emit(BRW_OPCODE_SHR, result, result_src, src_reg(23u));
            src_reg result_d(result_src);
            result_d.type = BRW_REGISTER_TYPE_D; /* does it matter? */
            result.type = BRW_REGISTER_TYPE_F;
            emit(ADD(result, result_d, src_reg(-127)));
         }

         if (vpi->DstReg.WriteMask & WRITEMASK_YZ) {
            /* Keep the mantissa bits, force the exponent to 0 (bias 127),
             * yielding a float in [1, 2).
             */
            result.writemask = WRITEMASK_Y;
            result.type = BRW_REGISTER_TYPE_UD;
            emit(AND(result, arg0_ud, src_reg((1u << 23) - 1)));
            emit(OR(result, result_src, src_reg(127u << 23)));
         }

         if (vpi->DstReg.WriteMask & WRITEMASK_Z) {
            /* result[2] = result[0] + LOG2(result[1]); */

            /* Why bother? The above is just a hint how to do this with a
             * taylor series. Maybe we *should* use a taylor series as by
             * the time all the above has been done it's almost certainly
             * quicker than calling the mathbox, even with low precision.
             *
             * Options are:
             * - result[0] + mathbox.LOG2(result[1])
             * - mathbox.LOG2(arg0.x)
             * - result[0] + inline_taylor_approx(result[1])
             */
            result.type = BRW_REGISTER_TYPE_F;
            result.writemask = WRITEMASK_Z;
            src_reg result_x(result), result_y(result), result_z(result);
            result_x.swizzle = BRW_SWIZZLE_XXXX;
            result_y.swizzle = BRW_SWIZZLE_YYYY;
            result_z.swizzle = BRW_SWIZZLE_ZZZZ;
            emit_math(SHADER_OPCODE_LOG2, result, result_y);
            emit(ADD(result, result_z, result_x));
         }

         if (vpi->DstReg.WriteMask & WRITEMASK_W) {
            result.type = BRW_REGISTER_TYPE_F;
            result.writemask = WRITEMASK_W;
            emit(MOV(result, src_reg(1.0f)));
         }
         break;
      }

      case OPCODE_MAD: {
         /* Expanded to MUL+ADD; a temp is required because src[2] may alias
          * dst after the common temporary-dst copy below.
          */
         src_reg temp = src_reg(this, glsl_type::vec4_type);
         emit(MUL(dst_reg(temp), src[0], src[1]));
         emit(ADD(dst, temp, src[2]));
         break;
      }

      case OPCODE_MAX:
         emit_minmax(BRW_CONDITIONAL_GE, dst, src[0], src[1]);
         break;

      case OPCODE_MIN:
         emit_minmax(BRW_CONDITIONAL_L, dst, src[0], src[1]);
         break;

      case OPCODE_MOV:
         emit(MOV(dst, src[0]));
         break;

      case OPCODE_MUL:
         emit(MUL(dst, src[0], src[1]));
         break;

      case OPCODE_POW:
         emit_math(SHADER_OPCODE_POW, dst, src[0], src[1]);
         break;

      case OPCODE_RCP:
         emit_math(SHADER_OPCODE_RCP, dst, src[0]);
         break;

      case OPCODE_RSQ:
         emit_math(SHADER_OPCODE_RSQ, dst, src[0]);
         break;

      case OPCODE_SGE:
         emit_vp_sop(BRW_CONDITIONAL_GE, dst, src[0], src[1], one);
         break;

      case OPCODE_SLT:
         emit_vp_sop(BRW_CONDITIONAL_L, dst, src[0], src[1], one);
         break;

      case OPCODE_SUB: {
         /* No SUB in the hardware IR; negate the second source and ADD. */
         src_reg neg_src1 = src[1];
         neg_src1.negate = !src[1].negate;
         emit(ADD(dst, src[0], neg_src1));
         break;
      }

      case OPCODE_SWZ:
         /* Note that SWZ's extended swizzles are handled in the general
          * get_src_reg() code.
          */
         emit(MOV(dst, src[0]));
         break;

      case OPCODE_XPD: {
         /* Cross product via two swizzled MULs and a negated ADD. */
         src_reg t1 = src_reg(this, glsl_type::vec4_type);
         src_reg t2 = src_reg(this, glsl_type::vec4_type);

         emit(MUL(dst_reg(t1),
                  swizzle(src[0], BRW_SWIZZLE_YZXW),
                  swizzle(src[1], BRW_SWIZZLE_ZXYW)));
         emit(MUL(dst_reg(t2),
                  swizzle(src[0], BRW_SWIZZLE_ZXYW),
                  swizzle(src[1], BRW_SWIZZLE_YZXW)));
         t2.negate = true;
         emit(ADD(dst, t1, t2));
         break;
      }

      case OPCODE_END:
         break;

      default:
         assert(!"Unsupported opcode in vertex program");
      }

      /* Copy the temporary back into the actual destination register. */
      if (_mesa_num_inst_dst_regs(vpi->Opcode) != 0) {
         emit(MOV(get_vp_dst_reg(vpi->DstReg), src_reg(dst)));
      }
   }

   /* If we used relative addressing, we need to upload all constants as
    * pull constants. Do that now.
    */
   if (this->need_all_constants_in_pull_buffer) {
      const struct gl_program_parameter_list *params = vp->Base.Parameters;
      unsigned i;
      for (i = 0; i < params->NumParameters * 4; i++) {
         stage_prog_data->pull_param[i] =
            &params->ParameterValues[i / 4][i % 4];
      }
      stage_prog_data->nr_pull_params = i;
   }
}
406
407 void
408 vec4_vs_visitor::setup_vp_regs()
409 {
410 /* PROGRAM_TEMPORARY */
411 int num_temp = prog->NumTemporaries;
412 vp_temp_regs = rzalloc_array(mem_ctx, src_reg, num_temp);
413 for (int i = 0; i < num_temp; i++)
414 vp_temp_regs[i] = src_reg(this, glsl_type::vec4_type);
415
416 /* PROGRAM_STATE_VAR etc. */
417 struct gl_program_parameter_list *plist = vp->Base.Parameters;
418 for (unsigned p = 0; p < plist->NumParameters; p++) {
419 unsigned components = plist->Parameters[p].Size;
420
421 /* Parameters should be either vec4 uniforms or single component
422 * constants; matrices and other larger types should have been broken
423 * down earlier.
424 */
425 assert(components <= 4);
426
427 this->uniform_size[this->uniforms] = 1; /* 1 vec4 */
428 this->uniform_vector_size[this->uniforms] = components;
429 for (unsigned i = 0; i < 4; i++) {
430 stage_prog_data->param[this->uniforms * 4 + i] = i >= components
431 ? 0 : &plist->ParameterValues[p][i];
432 }
433 this->uniforms++; /* counted in vec4 units */
434 }
435
436 /* PROGRAM_OUTPUT */
437 for (int slot = 0; slot < prog_data->vue_map.num_slots; slot++) {
438 int varying = prog_data->vue_map.slot_to_varying[slot];
439 if (varying == VARYING_SLOT_PSIZ)
440 output_reg[varying] = dst_reg(this, glsl_type::float_type);
441 else
442 output_reg[varying] = dst_reg(this, glsl_type::vec4_type);
443 assert(output_reg[varying].type == BRW_REGISTER_TYPE_F);
444 }
445
446 /* PROGRAM_ADDRESS */
447 this->vp_addr_reg = src_reg(this, glsl_type::int_type);
448 assert(this->vp_addr_reg.type == BRW_REGISTER_TYPE_D);
449 }
450
451 dst_reg
452 vec4_vs_visitor::get_vp_dst_reg(const prog_dst_register &dst)
453 {
454 dst_reg result;
455
456 assert(!dst.RelAddr);
457
458 switch (dst.File) {
459 case PROGRAM_TEMPORARY:
460 result = dst_reg(vp_temp_regs[dst.Index]);
461 break;
462
463 case PROGRAM_OUTPUT:
464 result = output_reg[dst.Index];
465 break;
466
467 case PROGRAM_ADDRESS: {
468 assert(dst.Index == 0);
469 result = dst_reg(this->vp_addr_reg);
470 break;
471 }
472
473 case PROGRAM_UNDEFINED:
474 return dst_null_f();
475
476 default:
477 unreachable("vec4_vp: bad destination register file");
478 }
479
480 result.writemask = dst.WriteMask;
481 return result;
482 }
483
/**
 * Map a Mesa IR source register onto a Vec4 IR src_reg, emitting any
 * instructions needed along the way (pull-constant loads for relatively
 * addressed parameters, and temporaries for SWZ-style zero/one swizzles
 * or per-component negation).
 */
src_reg
vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src)
{
   struct gl_program_parameter_list *plist = vp->Base.Parameters;

   src_reg result;

   /* Absolute-value source modifiers are not expected here. */
   assert(!src.Abs);

   switch (src.File) {
   case PROGRAM_UNDEFINED:
      return src_reg(brw_null_reg());

   case PROGRAM_TEMPORARY:
      result = vp_temp_regs[src.Index];
      break;

   case PROGRAM_INPUT:
      result = src_reg(ATTR, src.Index, glsl_type::vec4_type);
      result.type = BRW_REGISTER_TYPE_F;
      break;

   case PROGRAM_ADDRESS: {
      assert(src.Index == 0);
      result = this->vp_addr_reg;
      break;
   }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
      /* From the ARB_vertex_program specification:
       * "Relative addressing can only be used for accessing program
       * parameter arrays."
       */
      if (src.RelAddr) {
         /* Since we have no idea what the base of the array is, we need to
          * upload ALL constants as pull constants.
          */
         this->need_all_constants_in_pull_buffer = true;

         /* Add the small constant index to the address register */
         src_reg reladdr = src_reg(this, glsl_type::int_type);

         dst_reg dst_reladdr = dst_reg(reladdr);
         dst_reladdr.writemask = WRITEMASK_X;
         emit(ADD(dst_reladdr, this->vp_addr_reg, src_reg(src.Index)));

         /* Pre-gen6 scales the index by 16 — presumably a byte offset per
          * vec4; NOTE(review): confirm against emit_pull_constant_load_reg.
          */
         if (devinfo->gen < 6)
            emit(MUL(dst_reladdr, reladdr, src_reg(16)));

#if 0
         assert(src.Index < this->uniforms);
         result = src_reg(dst_reg(UNIFORM, 0));
         result.type = BRW_REGISTER_TYPE_F;
         result.reladdr = new(mem_ctx) src_reg();
         memcpy(result.reladdr, &reladdr, sizeof(src_reg));
#endif

         /* Load the parameter from the pull-constant buffer into a temp. */
         result = src_reg(this, glsl_type::vec4_type);
         src_reg surf_index = src_reg(unsigned(prog_data->base.binding_table.pull_constants_start));

         emit_pull_constant_load_reg(dst_reg(result),
                                     surf_index,
                                     reladdr,
                                     NULL, NULL /* before_block/inst */);
         break;
      }

      /* We actually want to look at the type in the Parameters list for this,
       * because this lets us upload constant builtin uniforms as actual
       * constants.
       */
      switch (plist->Parameters[src.Index].Type) {
      case PROGRAM_CONSTANT:
         /* Materialize the constant with per-channel immediate MOVs. */
         result = src_reg(this, glsl_type::vec4_type);
         for (int i = 0; i < 4; i++) {
            dst_reg t = dst_reg(result);
            t.writemask = 1 << i;
            emit(MOV(t, src_reg(plist->ParameterValues[src.Index][i].f)));
         }
         break;

      case PROGRAM_STATE_VAR:
         /* State vars live in the push-constant UNIFORM file, one vec4
          * each (see setup_vp_regs()).
          */
         assert(src.Index < this->uniforms);
         result = src_reg(dst_reg(UNIFORM, src.Index));
         result.type = BRW_REGISTER_TYPE_F;
         break;

      default:
         assert(!"Bad uniform in src register file");
         return src_reg(this, glsl_type::vec4_type);
      }
      break;

   default:
      assert(!"Bad src register file");
      return src_reg(this, glsl_type::vec4_type);
   }

   if (src.Swizzle != SWIZZLE_NOOP || src.Negate) {
      unsigned short zeros_mask = 0;
      unsigned short ones_mask = 0;
      unsigned short src_mask = 0;
      unsigned short src_swiz[4];

      for (int i = 0; i < 4; i++) {
         src_swiz[i] = 0; /* initialize for safety */

         /* The ZERO, ONE, and Negate options are only used for OPCODE_SWZ,
          * but it's simplest to handle it here.
          */
         int s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1 << i;
            src_swiz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1 << i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1 << i;
            break;
         }
      }

      result.swizzle =
         BRW_SWIZZLE4(src_swiz[0], src_swiz[1], src_swiz[2], src_swiz[3]);

      /* The hardware doesn't natively handle the SWZ instruction's zero/one
       * swizzles or per-component negation, so we need to use a temporary.
       */
      if (zeros_mask || ones_mask || src.Negate) {
         src_reg temp_src(this, glsl_type::vec4_type);
         dst_reg temp(temp_src);

         /* Copy the real components first, then overlay 0.0/1.0 channels
          * and finally negate the channels selected by src.Negate.
          */
         if (src_mask) {
            temp.writemask = src_mask;
            emit(MOV(temp, result));
         }

         if (zeros_mask) {
            temp.writemask = zeros_mask;
            emit(MOV(temp, src_reg(0.0f)));
         }

         if (ones_mask) {
            temp.writemask = ones_mask;
            emit(MOV(temp, src_reg(1.0f)));
         }

         if (src.Negate) {
            /* src.Negate is itself a per-channel mask, used directly as
             * the writemask.
             */
            temp.writemask = src.Negate;
            src_reg neg(temp_src);
            neg.negate = true;
            emit(MOV(temp, neg));
         }
         result = temp_src;
      }
   }

   return result;
}