2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file brw_vec4_vp.cpp
26 * A translator from Mesa IR to the i965 driver's Vec4 IR, used to implement
27 * ARB_vertex_program and fixed-function vertex processing.
30 #include "brw_context.h"
34 #include "program/prog_parameter.h"
35 #include "program/prog_print.h"
// Emits a two-instruction sequence: a CMP of src0 against src1 whose
// destination is the null register and whose conditional modifier is
// conditional_mod, then a SEL into dst with sources `one` and an immediate
// 0.0f, with the SEL's predicate set to BRW_PREDICATE_NORMAL.
// NOTE(review): `one` is presumably a src_reg parameter holding 1.0, since
// callers pass it as the fifth argument -- confirm against the declaration.
40 vec4_visitor::emit_vp_sop(uint32_t conditional_mod
,
41 dst_reg dst
, src_reg src0
, src_reg src1
,
44 vec4_instruction
*inst
;
// Comparison: the result is written to the null register.
46 inst
= emit(BRW_OPCODE_CMP
, dst_null_d(), src0
, src1
);
47 inst
->conditional_mod
= conditional_mod
;
// Selection between `one` and 0.0f, predicated.
49 inst
= emit(BRW_OPCODE_SEL
, dst
, one
, src_reg(0.0f
));
50 inst
->predicate
= BRW_PREDICATE_NORMAL
;
54 * Reswizzle a given source register.
// Composes a new swizzle on top of orig's existing one: for each of the four
// output channels, the selector x, y, z or w picks a channel out of
// orig.swizzle via BRW_GET_SWZ, and the four picks are packed together with
// BRW_SWIZZLE4 and stored in t.swizzle.
// NOTE(review): t is presumably a copy of orig that is returned to the
// caller -- confirm against the full definition.
58 reswizzle(src_reg orig
, unsigned x
, unsigned y
, unsigned z
, unsigned w
)
61 t
.swizzle
= BRW_SWIZZLE4(BRW_GET_SWZ(orig
.swizzle
, x
),
62 BRW_GET_SWZ(orig
.swizzle
, y
),
63 BRW_GET_SWZ(orig
.swizzle
, z
),
64 BRW_GET_SWZ(orig
.swizzle
, w
));
// Translates the Mesa IR instructions in prog->Instructions into vec4 IR.
//
// Steps visible in this function:
//  - clears need_all_constants_in_pull_buffer;
//  - loads 1.0f into a float register `one`, which is later handed to
//    emit_vp_sop();
//  - for every instruction: allocates a vec4 temporary as the destination,
//    fetches three sources through get_vp_src_reg(), emits code according to
//    vpi->Opcode, and (unless the opcode is OPCODE_END) copies the temporary
//    into the register returned by get_vp_dst_reg();
//  - opcodes without a translation are reported through _mesa_problem();
//  - if need_all_constants_in_pull_buffer ended up set, records a pointer to
//    every float component of every program parameter in
//    prog_data->pull_param and stores the count in nr_pull_params.
69 vec4_vs_visitor::emit_program_code()
71 this->need_all_constants_in_pull_buffer
= false;
75 /* Keep a reg with 1.0 around, for reuse by emit_vp_sop so that it can just
85 src_reg one
= src_reg(this, glsl_type::float_type
);
86 emit(MOV(dst_reg(one
), src_reg(1.0f
)));
// Translate the program one Mesa IR instruction at a time.
88 for (unsigned int insn
= 0; insn
< prog
->NumInstructions
; insn
++) {
89 const struct prog_instruction
*vpi
= &prog
->Instructions
[insn
];
95 /* We always emit into a temporary destination register to avoid
98 dst
= dst_reg(this, glsl_type::vec4_type
);
100 for (int i
= 0; i
< 3; i
++)
101 src
[i
] = get_vp_src_reg(vpi
->SrcReg
[i
]);
103 switch (vpi
->Opcode
) {
106 src
[0].negate
= false;
107 emit(MOV(dst
, src
[0]));
111 emit(ADD(dst
, src
[0], src
[1]));
116 dst
.writemask
= WRITEMASK_X
;
118 dst_f
.type
= BRW_REGISTER_TYPE_F
;
120 emit(RNDD(dst_f
, src
[0]));
121 emit(MOV(dst
, src_reg(dst_f
)));
123 emit(RNDD(dst
, src
[0]));
128 emit(DP3(dst
, src
[0], src
[1]));
131 emit(DP4(dst
, src
[0], src
[1]));
134 emit(DPH(dst
, src
[0], src
[1]));
139 if (vpi
->DstReg
.WriteMask
& WRITEMASK_X
) {
140 t
.writemask
= WRITEMASK_X
;
141 emit(MOV(t
, src_reg(1.0f
)));
143 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Y
) {
144 t
.writemask
= WRITEMASK_Y
;
145 emit(MUL(t
, src
[0], src
[1]));
147 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Z
) {
148 t
.writemask
= WRITEMASK_Z
;
149 emit(MOV(t
, src
[0]));
151 if (vpi
->DstReg
.WriteMask
& WRITEMASK_W
) {
152 t
.writemask
= WRITEMASK_W
;
153 emit(MOV(t
, src
[1]));
159 dst_reg result
= dst
;
160 if (vpi
->DstReg
.WriteMask
& WRITEMASK_X
) {
161 /* tmp_d = floor(src[0].x) */
162 src_reg tmp_d
= src_reg(this, glsl_type::ivec4_type
);
163 assert(tmp_d
.type
== BRW_REGISTER_TYPE_D
);
164 emit(RNDD(dst_reg(tmp_d
), reswizzle(src
[0], 0, 0, 0, 0)));
166 /* result[0] = 2.0 ^ tmp */
167 /* Adjust exponent for floating point: exp += 127 */
168 dst_reg
tmp_d_x(GRF
, tmp_d
.reg
, glsl_type::int_type
, WRITEMASK_X
);
169 emit(ADD(tmp_d_x
, tmp_d
, src_reg(127)));
171 /* Install exponent and sign. Excess drops off the edge: */
172 dst_reg
res_d_x(GRF
, result
.reg
, glsl_type::int_type
, WRITEMASK_X
);
173 emit(BRW_OPCODE_SHL
, res_d_x
, tmp_d
, src_reg(23));
175 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Y
) {
176 result
.writemask
= WRITEMASK_Y
;
177 emit(FRC(result
, src
[0]));
179 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Z
) {
180 result
.writemask
= WRITEMASK_Z
;
181 emit_math(SHADER_OPCODE_EXP2
, result
, src
[0]);
183 if (vpi
->DstReg
.WriteMask
& WRITEMASK_W
) {
184 result
.writemask
= WRITEMASK_W
;
185 emit(MOV(result
, src_reg(1.0f
)));
191 emit_math(SHADER_OPCODE_EXP2
, dst
, src
[0]);
195 emit(RNDD(dst
, src
[0]));
199 emit(FRC(dst
, src
[0]));
203 emit_math(SHADER_OPCODE_LOG2
, dst
, src
[0]);
207 dst_reg result
= dst
;
208 /* From the ARB_vertex_program spec:
210 * tmp = VectorLoad(op0);
211 * if (tmp.x < 0) tmp.x = 0;
212 * if (tmp.y < 0) tmp.y = 0;
213 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
214 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
217 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
220 * Note that we don't do the clamping to +/- 128. We didn't in
221 * brw_vs_emit.c either.
223 if (vpi
->DstReg
.WriteMask
& WRITEMASK_XW
) {
224 result
.writemask
= WRITEMASK_XW
;
225 emit(MOV(result
, src_reg(1.0f
)));
227 if (vpi
->DstReg
.WriteMask
& WRITEMASK_YZ
) {
228 result
.writemask
= WRITEMASK_YZ
;
229 emit(MOV(result
, src_reg(0.0f
)));
231 src_reg tmp_x
= reswizzle(src
[0], 0, 0, 0, 0);
233 emit(CMP(dst_null_d(), tmp_x
, src_reg(0.0f
), BRW_CONDITIONAL_G
));
234 emit(IF(BRW_PREDICATE_NORMAL
));
236 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Y
) {
237 result
.writemask
= WRITEMASK_Y
;
238 emit(MOV(result
, tmp_x
));
241 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Z
) {
242 /* if (tmp.y < 0) tmp.y = 0; */
243 src_reg tmp_y
= reswizzle(src
[0], 1, 1, 1, 1);
244 result
.writemask
= WRITEMASK_Z
;
245 emit_minmax(BRW_CONDITIONAL_G
, result
, tmp_y
, src_reg(0.0f
));
247 src_reg
clamped_y(result
);
248 clamped_y
.swizzle
= BRW_SWIZZLE_ZZZZ
;
250 src_reg tmp_w
= reswizzle(src
[0], 3, 3, 3, 3);
252 emit_math(SHADER_OPCODE_POW
, result
, clamped_y
, tmp_w
);
254 emit(BRW_OPCODE_ENDIF
);
260 dst_reg result
= dst
;
261 result
.type
= BRW_REGISTER_TYPE_UD
;
262 src_reg result_src
= src_reg(result
);
264 src_reg arg0_ud
= reswizzle(src
[0], 0, 0, 0, 0);
265 arg0_ud
.type
= BRW_REGISTER_TYPE_UD
;
267 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
270 * These almost look like they could be joined up, but not really
273 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
274 * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
276 if (vpi
->DstReg
.WriteMask
& WRITEMASK_XZ
) {
277 result
.writemask
= WRITEMASK_X
;
278 emit(AND(result
, arg0_ud
, src_reg((1u << 31) - 1)));
279 emit(BRW_OPCODE_SHR
, result
, result_src
, src_reg(23u));
280 src_reg
result_d(result_src
);
281 result_d
.type
= BRW_REGISTER_TYPE_D
; /* does it matter? */
282 result
.type
= BRW_REGISTER_TYPE_F
;
283 emit(ADD(result
, result_d
, src_reg(-127)));
286 if (vpi
->DstReg
.WriteMask
& WRITEMASK_YZ
) {
287 result
.writemask
= WRITEMASK_Y
;
288 result
.type
= BRW_REGISTER_TYPE_UD
;
289 emit(AND(result
, arg0_ud
, src_reg((1u << 23) - 1)));
290 emit(OR(result
, result_src
, src_reg(127u << 23)));
293 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Z
) {
294 /* result[2] = result[0] + LOG2(result[1]); */
296 /* Why bother? The above is just a hint how to do this with a
297 * taylor series. Maybe we *should* use a taylor series as by
298 * the time all the above has been done it's almost certainly
299 * quicker than calling the mathbox, even with low precision.
302 * - result[0] + mathbox.LOG2(result[1])
303 * - mathbox.LOG2(arg0.x)
304 * - result[0] + inline_taylor_approx(result[1])
306 result
.type
= BRW_REGISTER_TYPE_F
;
307 result
.writemask
= WRITEMASK_Z
;
308 src_reg
result_x(result
), result_y(result
), result_z(result
);
309 result_x
.swizzle
= BRW_SWIZZLE_XXXX
;
310 result_y
.swizzle
= BRW_SWIZZLE_YYYY
;
311 result_z
.swizzle
= BRW_SWIZZLE_ZZZZ
;
312 emit_math(SHADER_OPCODE_LOG2
, result
, result_y
);
313 emit(ADD(result
, result_z
, result_x
));
316 if (vpi
->DstReg
.WriteMask
& WRITEMASK_W
) {
317 result
.type
= BRW_REGISTER_TYPE_F
;
318 result
.writemask
= WRITEMASK_W
;
319 emit(MOV(result
, src_reg(1.0f
)));
325 src_reg temp
= src_reg(this, glsl_type::vec4_type
);
326 emit(MUL(dst_reg(temp
), src
[0], src
[1]));
327 emit(ADD(dst
, temp
, src
[2]));
332 emit_minmax(BRW_CONDITIONAL_G
, dst
, src
[0], src
[1]);
336 emit_minmax(BRW_CONDITIONAL_L
, dst
, src
[0], src
[1]);
340 emit(MOV(dst
, src
[0]));
344 emit(MUL(dst
, src
[0], src
[1]));
348 emit_math(SHADER_OPCODE_POW
, dst
, src
[0], src
[1]);
352 emit_math(SHADER_OPCODE_RCP
, dst
, src
[0]);
356 emit_math(SHADER_OPCODE_RSQ
, dst
, src
[0]);
360 emit_vp_sop(BRW_CONDITIONAL_GE
, dst
, src
[0], src
[1], one
);
364 emit_vp_sop(BRW_CONDITIONAL_L
, dst
, src
[0], src
[1], one
);
368 src_reg neg_src1
= src
[1];
369 neg_src1
.negate
= !src
[1].negate
;
370 emit(ADD(dst
, src
[0], neg_src1
));
375 /* Note that SWZ's extended swizzles are handled in the general
376 * get_vp_src_reg() code.
378 emit(MOV(dst
, src
[0]));
382 src_reg t1
= src_reg(this, glsl_type::vec4_type
);
383 src_reg t2
= src_reg(this, glsl_type::vec4_type
);
385 emit(MUL(dst_reg(t1
),
386 reswizzle(src
[0], 1, 2, 0, 3),
387 reswizzle(src
[1], 2, 0, 1, 3)));
388 emit(MUL(dst_reg(t2
),
389 reswizzle(src
[0], 2, 0, 1, 3),
390 reswizzle(src
[1], 1, 2, 0, 3)));
392 emit(ADD(dst
, t1
, t2
));
// Opcode with no translation here: report it through _mesa_problem.
400 _mesa_problem(ctx
, "Unsupported opcode %s in vertex program\n",
401 _mesa_opcode_string(vpi
->Opcode
));
404 /* Copy the temporary back into the actual destination register. */
405 if (vpi
->Opcode
!= OPCODE_END
) {
406 emit(MOV(get_vp_dst_reg(vpi
->DstReg
), src_reg(dst
)));
410 /* If we used relative addressing, we need to upload all constants as
411 * pull constants. Do that now.
413 if (this->need_all_constants_in_pull_buffer
) {
414 const struct gl_program_parameter_list
*params
=
415 vs_compile
->vp
->program
.Base
.Parameters
;
// One pull_param pointer per float component, four per parameter.
417 for (i
= 0; i
< params
->NumParameters
* 4; i
++) {
418 prog_data
->pull_param
[i
] =
419 &params
->ParameterValues
[i
/ 4][i
% 4].f
;
// i is the number of pointers written by the loop above.
421 prog_data
->nr_pull_params
= i
;
// Sets up the registers the vertex-program translator reads and writes:
//  - vp_temp_regs: an array of prog->NumTemporaries entries allocated with
//    rzalloc_array on mem_ctx, each initialised to a fresh vec4 register;
//  - uniforms: one entry per program parameter, with uniform_size 1,
//    uniform_vector_size equal to the parameter's Size, and four
//    prog_data->param pointers (0 for components at or beyond Size);
//  - output_reg: one register per VUE map slot, float for
//    VARYING_SLOT_PSIZ and vec4 otherwise (presumably via an else branch
//    not visible here -- confirm);
//  - vp_addr_reg: a single int register.
426 vec4_vs_visitor::setup_vp_regs()
428 /* PROGRAM_TEMPORARY */
429 int num_temp
= prog
->NumTemporaries
;
430 vp_temp_regs
= rzalloc_array(mem_ctx
, src_reg
, num_temp
);
431 for (int i
= 0; i
< num_temp
; i
++)
432 vp_temp_regs
[i
] = src_reg(this, glsl_type::vec4_type
);
434 /* PROGRAM_STATE_VAR etc. */
435 struct gl_program_parameter_list
*plist
=
436 vs_compile
->vp
->program
.Base
.Parameters
;
437 for (unsigned p
= 0; p
< plist
->NumParameters
; p
++) {
438 unsigned components
= plist
->Parameters
[p
].Size
;
440 /* Parameters should be either vec4 uniforms or single component
441 * constants; matrices and other larger types should have been broken
444 assert(components
<= 4);
446 this->uniform_size
[this->uniforms
] = 1; /* 1 vec4 */
447 this->uniform_vector_size
[this->uniforms
] = components
;
448 for (unsigned i
= 0; i
< 4; i
++) {
449 prog_data
->param
[this->uniforms
* 4 + i
] = i
>= components
450 ? 0 : &plist
->ParameterValues
[p
][i
].f
;
452 this->uniforms
++; /* counted in vec4 units */
// Output registers, indexed by the varying found in each VUE map slot.
456 for (int slot
= 0; slot
< prog_data
->vue_map
.num_slots
; slot
++) {
457 int varying
= prog_data
->vue_map
.slot_to_varying
[slot
];
458 if (varying
== VARYING_SLOT_PSIZ
)
459 output_reg
[varying
] = dst_reg(this, glsl_type::float_type
);
461 output_reg
[varying
] = dst_reg(this, glsl_type::vec4_type
);
462 assert(output_reg
[varying
].type
== BRW_REGISTER_TYPE_F
);
465 /* PROGRAM_ADDRESS */
466 this->vp_addr_reg
= src_reg(this, glsl_type::int_type
);
467 assert(this->vp_addr_reg
.type
== BRW_REGISTER_TYPE_D
);
// Maps a Mesa IR destination register onto a vec4 dst_reg.
//
// Relative addressing is not accepted for destinations (asserted below).
// PROGRAM_TEMPORARY resolves to vp_temp_regs[dst.Index], PROGRAM_ADDRESS
// (Index must be 0) to vp_addr_reg, and one further case to
// output_reg[dst.Index] (presumably the output register file -- confirm).
// The resolved register gets dst.WriteMask as its writemask. A register
// file this function does not recognise trips the assertion and yields a
// fresh vec4 temporary.
471 vec4_vs_visitor::get_vp_dst_reg(const prog_dst_register
&dst
)
475 assert(!dst
.RelAddr
);
478 case PROGRAM_TEMPORARY
:
479 result
= dst_reg(vp_temp_regs
[dst
.Index
]);
483 result
= output_reg
[dst
.Index
];
486 case PROGRAM_ADDRESS
: {
487 assert(dst
.Index
== 0);
488 result
= dst_reg(this->vp_addr_reg
);
492 case PROGRAM_UNDEFINED
:
// A string literal is a non-null pointer, so asserting it directly can
// never fail; negating it makes the check fire in debug builds.
496 assert(!"vec4_vp: bad destination register file");
497 return dst_reg(this, glsl_type::vec4_type
);
500 result
.writemask
= dst
.WriteMask
;
505 vec4_vs_visitor::get_vp_src_reg(const prog_src_register
&src
)
507 struct gl_program_parameter_list
*plist
=
508 vs_compile
->vp
->program
.Base
.Parameters
;
515 case PROGRAM_UNDEFINED
:
516 return src_reg(brw_null_reg());
518 case PROGRAM_TEMPORARY
:
519 result
= vp_temp_regs
[src
.Index
];
523 result
= src_reg(ATTR
, src
.Index
, glsl_type::vec4_type
);
524 result
.type
= BRW_REGISTER_TYPE_F
;
527 case PROGRAM_ADDRESS
: {
528 assert(src
.Index
== 0);
529 result
= this->vp_addr_reg
;
533 case PROGRAM_STATE_VAR
:
534 case PROGRAM_CONSTANT
:
535 /* From the ARB_vertex_program specification:
536 * "Relative addressing can only be used for accessing program
540 /* Since we have no idea what the base of the array is, we need to
541 * upload ALL constants as pull constants.
543 this->need_all_constants_in_pull_buffer
= true;
545 /* Add the small constant index to the address register */
546 src_reg reladdr
= src_reg(this, glsl_type::int_type
);
547 dst_reg dst_reladdr
= dst_reg(reladdr
);
548 dst_reladdr
.writemask
= WRITEMASK_X
;
549 emit(ADD(dst_reladdr
, this->vp_addr_reg
, src_reg(src
.Index
)));
552 emit(MUL(dst_reladdr
, reladdr
, src_reg(16)));
555 assert(src
.Index
< this->uniforms
);
556 result
= src_reg(dst_reg(UNIFORM
, 0));
557 result
.type
= BRW_REGISTER_TYPE_F
;
558 result
.reladdr
= new(mem_ctx
) src_reg();
559 memcpy(result
.reladdr
, &reladdr
, sizeof(src_reg
));
562 result
= src_reg(this, glsl_type::vec4_type
);
563 src_reg surf_index
= src_reg(unsigned(prog_data
->base
.binding_table
.pull_constants_start
));
564 vec4_instruction
*load
=
565 new(mem_ctx
) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD
,
566 dst_reg(result
), surf_index
, reladdr
);
573 /* We actually want to look at the type in the Parameters list for this,
574 * because this lets us upload constant builtin uniforms as actual
577 switch (plist
->Parameters
[src
.Index
].Type
) {
578 case PROGRAM_CONSTANT
:
579 result
= src_reg(this, glsl_type::vec4_type
);
580 for (int i
= 0; i
< 4; i
++) {
581 dst_reg t
= dst_reg(result
);
582 t
.writemask
= 1 << i
;
583 emit(MOV(t
, src_reg(plist
->ParameterValues
[src
.Index
][i
].f
)));
587 case PROGRAM_STATE_VAR
:
588 assert(src
.Index
< this->uniforms
);
589 result
= src_reg(dst_reg(UNIFORM
, src
.Index
));
590 result
.type
= BRW_REGISTER_TYPE_F
;
594 _mesa_problem(ctx
, "bad uniform src register file: %s\n",
595 _mesa_register_file_name((gl_register_file
)src
.File
));
596 return src_reg(this, glsl_type::vec4_type
);
601 _mesa_problem(ctx
, "bad src register file: %s\n",
602 _mesa_register_file_name((gl_register_file
)src
.File
));
603 return src_reg(this, glsl_type::vec4_type
);
606 if (src
.Swizzle
!= SWIZZLE_NOOP
|| src
.Negate
) {
607 unsigned short zeros_mask
= 0;
608 unsigned short ones_mask
= 0;
609 unsigned short src_mask
= 0;
610 unsigned short src_swiz
[4];
612 for (int i
= 0; i
< 4; i
++) {
613 src_swiz
[i
] = 0; /* initialize for safety */
615 /* The ZERO, ONE, and Negate options are only used for OPCODE_SWZ,
616 * but it's simplest to handle it here.
618 int s
= GET_SWZ(src
.Swizzle
, i
);
628 zeros_mask
|= 1 << i
;
637 BRW_SWIZZLE4(src_swiz
[0], src_swiz
[1], src_swiz
[2], src_swiz
[3]);
639 /* The hardware doesn't natively handle the SWZ instruction's zero/one
640 * swizzles or per-component negation, so we need to use a temporary.
642 if (zeros_mask
|| ones_mask
|| src
.Negate
) {
643 src_reg
temp_src(this, glsl_type::vec4_type
);
644 dst_reg
temp(temp_src
);
647 temp
.writemask
= src_mask
;
648 emit(MOV(temp
, result
));
652 temp
.writemask
= zeros_mask
;
653 emit(MOV(temp
, src_reg(0.0f
)));
657 temp
.writemask
= ones_mask
;
658 emit(MOV(temp
, src_reg(1.0f
)));
662 temp
.writemask
= src
.Negate
;
663 src_reg
neg(temp_src
);
665 emit(MOV(temp
, neg
));