2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file brw_vec4_vp.cpp
26 * A translator from Mesa IR to the i965 driver's Vec4 IR, used to implement
27 * ARB_vertex_program and fixed-function vertex processing.
30 #include "brw_context.h"
34 #include "program/prog_parameter.h"
35 #include "program/prog_print.h"
40 vec4_visitor::emit_vp_sop(enum brw_conditional_mod conditional_mod
,
41 dst_reg dst
, src_reg src0
, src_reg src1
,
44 vec4_instruction
*inst
;
46 inst
= emit(BRW_OPCODE_CMP
, dst_null_d(), src0
, src1
);
47 inst
->conditional_mod
= conditional_mod
;
49 inst
= emit(BRW_OPCODE_SEL
, dst
, one
, src_reg(0.0f
));
50 inst
->predicate
= BRW_PREDICATE_NORMAL
;
54 vec4_vs_visitor::emit_program_code()
56 this->need_all_constants_in_pull_buffer
= false;
60 /* Keep a reg with 1.0 around, for reuse by emit_vp_sop so that it can just
70 src_reg one
= src_reg(this, glsl_type::float_type
);
71 emit(MOV(dst_reg(one
), src_reg(1.0f
)));
73 for (unsigned int insn
= 0; insn
< prog
->NumInstructions
; insn
++) {
74 const struct prog_instruction
*vpi
= &prog
->Instructions
[insn
];
80 /* We always emit into a temporary destination register to avoid
83 dst
= dst_reg(this, glsl_type::vec4_type
);
85 for (int i
= 0; i
< 3; i
++)
86 src
[i
] = get_vp_src_reg(vpi
->SrcReg
[i
]);
88 switch (vpi
->Opcode
) {
91 src
[0].negate
= false;
92 emit(MOV(dst
, src
[0]));
96 emit(ADD(dst
, src
[0], src
[1]));
101 dst
.writemask
= WRITEMASK_X
;
103 dst_f
.type
= BRW_REGISTER_TYPE_F
;
105 emit(RNDD(dst_f
, src
[0]));
106 emit(MOV(dst
, src_reg(dst_f
)));
108 emit(RNDD(dst
, src
[0]));
113 emit(DP3(dst
, src
[0], src
[1]));
116 emit(DP4(dst
, src
[0], src
[1]));
119 emit(DPH(dst
, src
[0], src
[1]));
124 if (vpi
->DstReg
.WriteMask
& WRITEMASK_X
) {
125 t
.writemask
= WRITEMASK_X
;
126 emit(MOV(t
, src_reg(1.0f
)));
128 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Y
) {
129 t
.writemask
= WRITEMASK_Y
;
130 emit(MUL(t
, src
[0], src
[1]));
132 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Z
) {
133 t
.writemask
= WRITEMASK_Z
;
134 emit(MOV(t
, src
[0]));
136 if (vpi
->DstReg
.WriteMask
& WRITEMASK_W
) {
137 t
.writemask
= WRITEMASK_W
;
138 emit(MOV(t
, src
[1]));
144 dst_reg result
= dst
;
145 if (vpi
->DstReg
.WriteMask
& WRITEMASK_X
) {
146 /* tmp_d = floor(src[0].x) */
147 src_reg tmp_d
= src_reg(this, glsl_type::ivec4_type
);
148 assert(tmp_d
.type
== BRW_REGISTER_TYPE_D
);
149 emit(RNDD(dst_reg(tmp_d
), swizzle(src
[0], BRW_SWIZZLE_XXXX
)));
151 /* result[0] = 2.0 ^ tmp */
152 /* Adjust exponent for floating point: exp += 127 */
153 dst_reg
tmp_d_x(GRF
, tmp_d
.reg
, glsl_type::int_type
, WRITEMASK_X
);
154 emit(ADD(tmp_d_x
, tmp_d
, src_reg(127)));
156 /* Install exponent and sign. Excess drops off the edge: */
157 dst_reg
res_d_x(GRF
, result
.reg
, glsl_type::int_type
, WRITEMASK_X
);
158 emit(BRW_OPCODE_SHL
, res_d_x
, tmp_d
, src_reg(23));
160 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Y
) {
161 result
.writemask
= WRITEMASK_Y
;
162 emit(FRC(result
, src
[0]));
164 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Z
) {
165 result
.writemask
= WRITEMASK_Z
;
166 emit_math(SHADER_OPCODE_EXP2
, result
, src
[0]);
168 if (vpi
->DstReg
.WriteMask
& WRITEMASK_W
) {
169 result
.writemask
= WRITEMASK_W
;
170 emit(MOV(result
, src_reg(1.0f
)));
176 emit_math(SHADER_OPCODE_EXP2
, dst
, src
[0]);
180 emit(RNDD(dst
, src
[0]));
184 emit(FRC(dst
, src
[0]));
188 emit_math(SHADER_OPCODE_LOG2
, dst
, src
[0]);
192 dst_reg result
= dst
;
193 /* From the ARB_vertex_program spec:
195 * tmp = VectorLoad(op0);
196 * if (tmp.x < 0) tmp.x = 0;
197 * if (tmp.y < 0) tmp.y = 0;
198 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
199 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
202 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
205 * Note that we don't do the clamping to +/- 128. We didn't in
206 * brw_vs_emit.c either.
208 if (vpi
->DstReg
.WriteMask
& WRITEMASK_XW
) {
209 result
.writemask
= WRITEMASK_XW
;
210 emit(MOV(result
, src_reg(1.0f
)));
212 if (vpi
->DstReg
.WriteMask
& WRITEMASK_YZ
) {
213 result
.writemask
= WRITEMASK_YZ
;
214 emit(MOV(result
, src_reg(0.0f
)));
216 src_reg tmp_x
= swizzle(src
[0], BRW_SWIZZLE_XXXX
);
218 emit(CMP(dst_null_d(), tmp_x
, src_reg(0.0f
), BRW_CONDITIONAL_G
));
219 emit(IF(BRW_PREDICATE_NORMAL
));
221 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Y
) {
222 result
.writemask
= WRITEMASK_Y
;
223 emit(MOV(result
, tmp_x
));
226 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Z
) {
227 /* if (tmp.y < 0) tmp.y = 0; */
228 src_reg tmp_y
= swizzle(src
[0], BRW_SWIZZLE_YYYY
);
229 result
.writemask
= WRITEMASK_Z
;
230 emit_minmax(BRW_CONDITIONAL_G
, result
, tmp_y
, src_reg(0.0f
));
232 src_reg
clamped_y(result
);
233 clamped_y
.swizzle
= BRW_SWIZZLE_ZZZZ
;
235 src_reg tmp_w
= swizzle(src
[0], BRW_SWIZZLE_WWWW
);
237 emit_math(SHADER_OPCODE_POW
, result
, clamped_y
, tmp_w
);
239 emit(BRW_OPCODE_ENDIF
);
245 dst_reg result
= dst
;
246 result
.type
= BRW_REGISTER_TYPE_UD
;
247 src_reg result_src
= src_reg(result
);
249 src_reg arg0_ud
= swizzle(src
[0], BRW_SWIZZLE_XXXX
);
250 arg0_ud
.type
= BRW_REGISTER_TYPE_UD
;
252 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
255 * These almost look like they could be joined up, but not really
258 * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
259 * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
261 if (vpi
->DstReg
.WriteMask
& WRITEMASK_XZ
) {
262 result
.writemask
= WRITEMASK_X
;
263 emit(AND(result
, arg0_ud
, src_reg((1u << 31) - 1)));
264 emit(BRW_OPCODE_SHR
, result
, result_src
, src_reg(23u));
265 src_reg
result_d(result_src
);
266 result_d
.type
= BRW_REGISTER_TYPE_D
; /* does it matter? */
267 result
.type
= BRW_REGISTER_TYPE_F
;
268 emit(ADD(result
, result_d
, src_reg(-127)));
271 if (vpi
->DstReg
.WriteMask
& WRITEMASK_YZ
) {
272 result
.writemask
= WRITEMASK_Y
;
273 result
.type
= BRW_REGISTER_TYPE_UD
;
274 emit(AND(result
, arg0_ud
, src_reg((1u << 23) - 1)));
275 emit(OR(result
, result_src
, src_reg(127u << 23)));
278 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Z
) {
279 /* result[2] = result[0] + LOG2(result[1]); */
281 /* Why bother? The above is just a hint how to do this with a
282 * taylor series. Maybe we *should* use a taylor series as by
283 * the time all the above has been done it's almost certainly
284 * quicker than calling the mathbox, even with low precision.
287 * - result[0] + mathbox.LOG2(result[1])
288 * - mathbox.LOG2(arg0.x)
289 * - result[0] + inline_taylor_approx(result[1])
291 result
.type
= BRW_REGISTER_TYPE_F
;
292 result
.writemask
= WRITEMASK_Z
;
293 src_reg
result_x(result
), result_y(result
), result_z(result
);
294 result_x
.swizzle
= BRW_SWIZZLE_XXXX
;
295 result_y
.swizzle
= BRW_SWIZZLE_YYYY
;
296 result_z
.swizzle
= BRW_SWIZZLE_ZZZZ
;
297 emit_math(SHADER_OPCODE_LOG2
, result
, result_y
);
298 emit(ADD(result
, result_z
, result_x
));
301 if (vpi
->DstReg
.WriteMask
& WRITEMASK_W
) {
302 result
.type
= BRW_REGISTER_TYPE_F
;
303 result
.writemask
= WRITEMASK_W
;
304 emit(MOV(result
, src_reg(1.0f
)));
310 src_reg temp
= src_reg(this, glsl_type::vec4_type
);
311 emit(MUL(dst_reg(temp
), src
[0], src
[1]));
312 emit(ADD(dst
, temp
, src
[2]));
317 emit_minmax(BRW_CONDITIONAL_G
, dst
, src
[0], src
[1]);
321 emit_minmax(BRW_CONDITIONAL_L
, dst
, src
[0], src
[1]);
325 emit(MOV(dst
, src
[0]));
329 emit(MUL(dst
, src
[0], src
[1]));
333 emit_math(SHADER_OPCODE_POW
, dst
, src
[0], src
[1]);
337 emit_math(SHADER_OPCODE_RCP
, dst
, src
[0]);
341 emit_math(SHADER_OPCODE_RSQ
, dst
, src
[0]);
345 emit_vp_sop(BRW_CONDITIONAL_GE
, dst
, src
[0], src
[1], one
);
349 emit_vp_sop(BRW_CONDITIONAL_L
, dst
, src
[0], src
[1], one
);
353 src_reg neg_src1
= src
[1];
354 neg_src1
.negate
= !src
[1].negate
;
355 emit(ADD(dst
, src
[0], neg_src1
));
360 /* Note that SWZ's extended swizzles are handled in the general
361 * get_vp_src_reg() code.
363 emit(MOV(dst
, src
[0]));
367 src_reg t1
= src_reg(this, glsl_type::vec4_type
);
368 src_reg t2
= src_reg(this, glsl_type::vec4_type
);
370 emit(MUL(dst_reg(t1
),
371 swizzle(src
[0], BRW_SWIZZLE_YZXW
),
372 swizzle(src
[1], BRW_SWIZZLE_ZXYW
)));
373 emit(MUL(dst_reg(t2
),
374 swizzle(src
[0], BRW_SWIZZLE_ZXYW
),
375 swizzle(src
[1], BRW_SWIZZLE_YZXW
)));
377 emit(ADD(dst
, t1
, t2
));
385 _mesa_problem(ctx
, "Unsupported opcode %s in vertex program\n",
386 _mesa_opcode_string(vpi
->Opcode
));
389 /* Copy the temporary back into the actual destination register. */
390 if (vpi
->Opcode
!= OPCODE_END
) {
391 emit(MOV(get_vp_dst_reg(vpi
->DstReg
), src_reg(dst
)));
395 /* If we used relative addressing, we need to upload all constants as
396 * pull constants. Do that now.
398 if (this->need_all_constants_in_pull_buffer
) {
399 const struct gl_program_parameter_list
*params
=
400 vs_compile
->vp
->program
.Base
.Parameters
;
402 for (i
= 0; i
< params
->NumParameters
* 4; i
++) {
403 stage_prog_data
->pull_param
[i
] =
404 &params
->ParameterValues
[i
/ 4][i
% 4].f
;
406 stage_prog_data
->nr_pull_params
= i
;
411 vec4_vs_visitor::setup_vp_regs()
413 /* PROGRAM_TEMPORARY */
414 int num_temp
= prog
->NumTemporaries
;
415 vp_temp_regs
= rzalloc_array(mem_ctx
, src_reg
, num_temp
);
416 for (int i
= 0; i
< num_temp
; i
++)
417 vp_temp_regs
[i
] = src_reg(this, glsl_type::vec4_type
);
419 /* PROGRAM_STATE_VAR etc. */
420 struct gl_program_parameter_list
*plist
=
421 vs_compile
->vp
->program
.Base
.Parameters
;
422 for (unsigned p
= 0; p
< plist
->NumParameters
; p
++) {
423 unsigned components
= plist
->Parameters
[p
].Size
;
425 /* Parameters should be either vec4 uniforms or single component
426 * constants; matrices and other larger types should have been broken
429 assert(components
<= 4);
431 this->uniform_size
[this->uniforms
] = 1; /* 1 vec4 */
432 this->uniform_vector_size
[this->uniforms
] = components
;
433 for (unsigned i
= 0; i
< 4; i
++) {
434 stage_prog_data
->param
[this->uniforms
* 4 + i
] = i
>= components
435 ? 0 : &plist
->ParameterValues
[p
][i
].f
;
437 this->uniforms
++; /* counted in vec4 units */
441 for (int slot
= 0; slot
< prog_data
->vue_map
.num_slots
; slot
++) {
442 int varying
= prog_data
->vue_map
.slot_to_varying
[slot
];
443 if (varying
== VARYING_SLOT_PSIZ
)
444 output_reg
[varying
] = dst_reg(this, glsl_type::float_type
);
446 output_reg
[varying
] = dst_reg(this, glsl_type::vec4_type
);
447 assert(output_reg
[varying
].type
== BRW_REGISTER_TYPE_F
);
450 /* PROGRAM_ADDRESS */
451 this->vp_addr_reg
= src_reg(this, glsl_type::int_type
);
452 assert(this->vp_addr_reg
.type
== BRW_REGISTER_TYPE_D
);
456 vec4_vs_visitor::get_vp_dst_reg(const prog_dst_register
&dst
)
460 assert(!dst
.RelAddr
);
463 case PROGRAM_TEMPORARY
:
464 result
= dst_reg(vp_temp_regs
[dst
.Index
]);
468 result
= output_reg
[dst
.Index
];
471 case PROGRAM_ADDRESS
: {
472 assert(dst
.Index
== 0);
473 result
= dst_reg(this->vp_addr_reg
);
477 case PROGRAM_UNDEFINED
:
481 unreachable("vec4_vp: bad destination register file");
484 result
.writemask
= dst
.WriteMask
;
489 vec4_vs_visitor::get_vp_src_reg(const prog_src_register
&src
)
491 struct gl_program_parameter_list
*plist
=
492 vs_compile
->vp
->program
.Base
.Parameters
;
499 case PROGRAM_UNDEFINED
:
500 return src_reg(brw_null_reg());
502 case PROGRAM_TEMPORARY
:
503 result
= vp_temp_regs
[src
.Index
];
507 result
= src_reg(ATTR
, src
.Index
, glsl_type::vec4_type
);
508 result
.type
= BRW_REGISTER_TYPE_F
;
511 case PROGRAM_ADDRESS
: {
512 assert(src
.Index
== 0);
513 result
= this->vp_addr_reg
;
517 case PROGRAM_STATE_VAR
:
518 case PROGRAM_CONSTANT
:
519 /* From the ARB_vertex_program specification:
520 * "Relative addressing can only be used for accessing program
524 /* Since we have no idea what the base of the array is, we need to
525 * upload ALL constants as pull constants.
527 this->need_all_constants_in_pull_buffer
= true;
529 /* Add the small constant index to the address register */
530 src_reg reladdr
= src_reg(this, glsl_type::int_type
);
531 dst_reg dst_reladdr
= dst_reg(reladdr
);
532 dst_reladdr
.writemask
= WRITEMASK_X
;
533 emit(ADD(dst_reladdr
, this->vp_addr_reg
, src_reg(src
.Index
)));
536 emit(MUL(dst_reladdr
, reladdr
, src_reg(16)));
539 assert(src
.Index
< this->uniforms
);
540 result
= src_reg(dst_reg(UNIFORM
, 0));
541 result
.type
= BRW_REGISTER_TYPE_F
;
542 result
.reladdr
= new(mem_ctx
) src_reg();
543 memcpy(result
.reladdr
, &reladdr
, sizeof(src_reg
));
546 result
= src_reg(this, glsl_type::vec4_type
);
547 src_reg surf_index
= src_reg(unsigned(prog_data
->base
.binding_table
.pull_constants_start
));
548 vec4_instruction
*load
=
549 new(mem_ctx
) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD
,
550 dst_reg(result
), surf_index
, reladdr
);
557 /* We actually want to look at the type in the Parameters list for this,
558 * because this lets us upload constant builtin uniforms as actual
561 switch (plist
->Parameters
[src
.Index
].Type
) {
562 case PROGRAM_CONSTANT
:
563 result
= src_reg(this, glsl_type::vec4_type
);
564 for (int i
= 0; i
< 4; i
++) {
565 dst_reg t
= dst_reg(result
);
566 t
.writemask
= 1 << i
;
567 emit(MOV(t
, src_reg(plist
->ParameterValues
[src
.Index
][i
].f
)));
571 case PROGRAM_STATE_VAR
:
572 assert(src
.Index
< this->uniforms
);
573 result
= src_reg(dst_reg(UNIFORM
, src
.Index
));
574 result
.type
= BRW_REGISTER_TYPE_F
;
578 _mesa_problem(ctx
, "bad uniform src register file: %s\n",
579 _mesa_register_file_name((gl_register_file
)src
.File
));
580 return src_reg(this, glsl_type::vec4_type
);
585 _mesa_problem(ctx
, "bad src register file: %s\n",
586 _mesa_register_file_name((gl_register_file
)src
.File
));
587 return src_reg(this, glsl_type::vec4_type
);
590 if (src
.Swizzle
!= SWIZZLE_NOOP
|| src
.Negate
) {
591 unsigned short zeros_mask
= 0;
592 unsigned short ones_mask
= 0;
593 unsigned short src_mask
= 0;
594 unsigned short src_swiz
[4];
596 for (int i
= 0; i
< 4; i
++) {
597 src_swiz
[i
] = 0; /* initialize for safety */
599 /* The ZERO, ONE, and Negate options are only used for OPCODE_SWZ,
600 * but it's simplest to handle it here.
602 int s
= GET_SWZ(src
.Swizzle
, i
);
612 zeros_mask
|= 1 << i
;
621 BRW_SWIZZLE4(src_swiz
[0], src_swiz
[1], src_swiz
[2], src_swiz
[3]);
623 /* The hardware doesn't natively handle the SWZ instruction's zero/one
624 * swizzles or per-component negation, so we need to use a temporary.
626 if (zeros_mask
|| ones_mask
|| src
.Negate
) {
627 src_reg
temp_src(this, glsl_type::vec4_type
);
628 dst_reg
temp(temp_src
);
631 temp
.writemask
= src_mask
;
632 emit(MOV(temp
, result
));
636 temp
.writemask
= zeros_mask
;
637 emit(MOV(temp
, src_reg(0.0f
)));
641 temp
.writemask
= ones_mask
;
642 emit(MOV(temp
, src_reg(1.0f
)));
646 temp
.writemask
= src
.Negate
;
647 src_reg
neg(temp_src
);
649 emit(MOV(temp
, neg
));