/*
 * Copyright © 2012 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/** @file brw_vec4_vp.cpp
 *
 * A translator from Mesa IR to the i965 driver's Vec4 IR, used to implement
 * ARB_vertex_program and fixed-function vertex processing.
 */
30 #include "brw_context.h"
33 #include "program/prog_parameter.h"
34 #include "program/prog_print.h"
/* Emit the CMP + predicated-SEL sequence that implements the ARB_vp
 * "set on condition" opcodes (SGE / SLT, see the callers below): a CMP
 * of src0 against src1 with the requested conditional_mod loads the
 * flag register, then a predicated SEL writes either "one" (a register
 * the caller has preloaded with 1.0f) or 0.0f into dst.
 *
 * NOTE(review): this listing is garbled -- the original line numbers are
 * fused into the text and the tail of the parameter list (presumably
 * ", src_reg one)") plus the surrounding braces are missing from the
 * visible text. Code below is byte-identical to the listing; only
 * comments were added. Confirm against the complete file.
 */
39 vec4_visitor::emit_vp_sop(uint32_t conditional_mod
,
40 dst_reg dst
, src_reg src0
, src_reg src1
,
43 vec4_instruction
*inst
;
/* CMP null.d, src0, src1 with conditional_mod: sets the flag register. */
45 inst
= emit(BRW_OPCODE_CMP
, dst_null_d(), src0
, src1
);
46 inst
->conditional_mod
= conditional_mod
;
/* Predicated SEL: dst = flag ? one : 0.0f. */
48 inst
= emit(BRW_OPCODE_SEL
, dst
, one
, src_reg(0.0f
));
49 inst
->predicate
= BRW_PREDICATE_NORMAL
;
/* Compose a swizzle onto a source register: the returned register's
 * component i reads orig's component selected by (x, y, z, w)[i],
 * chained through orig's existing swizzle via BRW_GET_SWZ.
 *
 * NOTE(review): garbled listing -- the declaration of the local "t"
 * (presumably "src_reg t = orig;"), the return type line, and the
 * trailing "return t;" are missing from the visible text. Code below is
 * byte-identical to the listing; only comments were added.
 */
53 * Reswizzle a given source register.
57 reswizzle(src_reg orig
, unsigned x
, unsigned y
, unsigned z
, unsigned w
)
60 t
.swizzle
= BRW_SWIZZLE4(BRW_GET_SWZ(orig
.swizzle
, x
),
61 BRW_GET_SWZ(orig
.swizzle
, y
),
62 BRW_GET_SWZ(orig
.swizzle
, z
),
63 BRW_GET_SWZ(orig
.swizzle
, w
));
/* Translate the Mesa IR of an ARB_vertex_program / fixed-function vertex
 * program (prog->Instructions) into Vec4 IR, one prog_instruction at a
 * time. Each instruction's result is written to a fresh temporary and
 * copied to the real destination afterwards (see the MOV at the bottom of
 * the loop), then any relative-addressed constants are set up as pull
 * constants.
 *
 * NOTE(review): this listing is garbled -- original source line numbers
 * are fused into the text, and many lines are missing from the visible
 * text: every "case OPCODE_*:" label, the "break;" statements, closing
 * braces, and several declarations (e.g. "dst_reg dst;", "src_reg src[3];",
 * "dst_reg t = dst;"). The opcode attributions in the comments below are
 * therefore inferred from the surviving statements and the original line
 * numbers -- confirm against the complete file. Code is reproduced
 * byte-for-byte; only comments were added.
 */
68 vec4_vs_visitor::emit_program_code()
70 this->need_all_constants_in_pull_buffer
= false;
74 /* Keep a reg with 1.0 around, for reuse by emit_vs_sop so that it can just
84 src_reg one
= src_reg(this, glsl_type::float_type
);
85 emit(MOV(dst_reg(one
), src_reg(1.0f
)));
/* Main translation loop over the Mesa IR instruction array. */
87 for (unsigned int insn
= 0; insn
< prog
->NumInstructions
; insn
++) {
88 const struct prog_instruction
*vpi
= &prog
->Instructions
[insn
];
94 /* We always emit into a temporary destination register to avoid
97 dst
= dst_reg(this, glsl_type::vec4_type
);
/* Resolve up to three source operands (swizzles/negates handled in
 * get_vp_src_reg). */
99 for (int i
= 0; i
< 3; i
++)
100 src
[i
] = get_vp_src_reg(vpi
->SrcReg
[i
]);
102 switch (vpi
->Opcode
) {
/* presumably OPCODE_ABS (case label missing): force negate off, MOV. */
105 src
[0].negate
= false;
106 emit(MOV(dst
, src
[0]));
/* presumably OPCODE_ADD (case label missing). */
110 emit(ADD(dst
, src
[0], src
[1]));
/* presumably OPCODE_ARL (case label missing): gen6+ needs the round-down
 * done as a float and moved into the integer address value. */
114 if (intel
->gen
>= 6) {
115 dst
.writemask
= WRITEMASK_X
;
117 dst_f
.type
= BRW_REGISTER_TYPE_F
;
119 emit(RNDD(dst_f
, src
[0]));
120 emit(MOV(dst
, src_reg(dst_f
)));
/* pre-gen6 path (the "} else {" line is missing from the listing). */
122 emit(RNDD(dst
, src
[0]));
/* presumably OPCODE_DP3 / DP4 / DPH (case labels missing). */
127 emit(DP3(dst
, src
[0], src
[1]));
130 emit(DP4(dst
, src
[0], src
[1]));
133 emit(DPH(dst
, src
[0], src
[1]));
/* presumably OPCODE_DST (case label missing): per-channel distance vector
 * result = (1, src0.y*src1.y, src0.z, src1.w), masked by the write mask.
 * "t" is a dst_reg copy of dst (its declaration is missing from the
 * listing). */
138 if (vpi
->DstReg
.WriteMask
& WRITEMASK_X
) {
139 t
.writemask
= WRITEMASK_X
;
140 emit(MOV(t
, src_reg(1.0f
)));
142 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Y
) {
143 t
.writemask
= WRITEMASK_Y
;
144 emit(MUL(t
, src
[0], src
[1]));
146 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Z
) {
147 t
.writemask
= WRITEMASK_Z
;
148 emit(MOV(t
, src
[0]));
150 if (vpi
->DstReg
.WriteMask
& WRITEMASK_W
) {
151 t
.writemask
= WRITEMASK_W
;
152 emit(MOV(t
, src
[1]));
/* presumably OPCODE_EXP (case label missing): builds
 * (2^floor(x), frac(x), 2^x, 1) channel by channel. */
158 dst_reg result
= dst
;
159 if (vpi
->DstReg
.WriteMask
& WRITEMASK_X
) {
160 /* tmp_d = floor(src[0].x) */
161 src_reg tmp_d
= src_reg(this, glsl_type::ivec4_type
);
162 assert(tmp_d
.type
== BRW_REGISTER_TYPE_D
);
163 emit(RNDD(dst_reg(tmp_d
), reswizzle(src
[0], 0, 0, 0, 0)));
165 /* result[0] = 2.0 ^ tmp */
166 /* Adjust exponent for floating point: exp += 127 */
167 dst_reg
tmp_d_x(GRF
, tmp_d
.reg
, glsl_type::int_type
, WRITEMASK_X
);
168 emit(ADD(tmp_d_x
, tmp_d
, src_reg(127)));
170 /* Install exponent and sign. Excess drops off the edge: */
171 dst_reg
res_d_x(GRF
, result
.reg
, glsl_type::int_type
, WRITEMASK_X
);
172 emit(BRW_OPCODE_SHL
, res_d_x
, tmp_d
, src_reg(23));
174 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Y
) {
175 result
.writemask
= WRITEMASK_Y
;
176 emit(FRC(result
, src
[0]));
178 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Z
) {
179 result
.writemask
= WRITEMASK_Z
;
180 emit_math(SHADER_OPCODE_EXP2
, result
, src
[0]);
182 if (vpi
->DstReg
.WriteMask
& WRITEMASK_W
) {
183 result
.writemask
= WRITEMASK_W
;
184 emit(MOV(result
, src_reg(1.0f
)));
/* presumably OPCODE_EX2 / FLR / FRC / LG2 (case labels missing). */
190 emit_math(SHADER_OPCODE_EXP2
, dst
, src
[0]);
194 emit(RNDD(dst
, src
[0]));
198 emit(FRC(dst
, src
[0]));
202 emit_math(SHADER_OPCODE_LOG2
, dst
, src
[0]);
/* presumably OPCODE_LIT (case label missing): lighting coefficients. */
206 dst_reg result
= dst
;
207 /* From the ARB_vertex_program spec:
209 * tmp = VectorLoad(op0);
210 * if (tmp.x < 0) tmp.x = 0;
211 * if (tmp.y < 0) tmp.y = 0;
212 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
213 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
216 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
219 * Note that we don't do the clamping to +/- 128. We didn't in
220 * brw_vs_emit.c either.
222 if (vpi
->DstReg
.WriteMask
& WRITEMASK_XW
) {
223 result
.writemask
= WRITEMASK_XW
;
224 emit(MOV(result
, src_reg(1.0f
)));
226 if (vpi
->DstReg
.WriteMask
& WRITEMASK_YZ
) {
227 result
.writemask
= WRITEMASK_YZ
;
228 emit(MOV(result
, src_reg(0.0f
)));
230 src_reg tmp_x
= reswizzle(src
[0], 0, 0, 0, 0);
/* Flow-control IF on tmp.x > 0 so y/z are only written when lit. */
232 emit(CMP(dst_null_d(), tmp_x
, src_reg(0.0f
), BRW_CONDITIONAL_G
));
233 emit(IF(BRW_PREDICATE_NORMAL
));
235 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Y
) {
236 result
.writemask
= WRITEMASK_Y
;
237 emit(MOV(result
, tmp_x
));
240 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Z
) {
241 /* if (tmp.y < 0) tmp.y = 0; */
242 src_reg tmp_y
= reswizzle(src
[0], 1, 1, 1, 1);
243 result
.writemask
= WRITEMASK_Z
;
244 emit_minmax(BRW_CONDITIONAL_G
, result
, tmp_y
, src_reg(0.0f
));
246 src_reg
clamped_y(result
);
247 clamped_y
.swizzle
= BRW_SWIZZLE_ZZZZ
;
249 src_reg tmp_w
= reswizzle(src
[0], 3, 3, 3, 3);
251 emit_math(SHADER_OPCODE_POW
, result
, clamped_y
, tmp_w
);
253 emit(BRW_OPCODE_ENDIF
);
/* presumably OPCODE_LOG (case label missing): decomposes x into exponent
 * and mantissa by integer bit manipulation of the float encoding. */
259 dst_reg result
= dst
;
260 result
.type
= BRW_REGISTER_TYPE_UD
;
261 src_reg result_src
= src_reg(result
);
263 src_reg arg0_ud
= reswizzle(src
[0], 0, 0, 0, 0);
264 arg0_ud
.type
= BRW_REGISTER_TYPE_UD
;
266 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
269 * These almost look likey they could be joined up, but not really
272 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
273 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
275 if (vpi
->DstReg
.WriteMask
& WRITEMASK_XZ
) {
276 result
.writemask
= WRITEMASK_X
;
277 emit(AND(result
, arg0_ud
, src_reg((1u << 31) - 1)));
278 emit(BRW_OPCODE_SHR
, result
, result_src
, src_reg(23u));
279 src_reg
result_d(result_src
);
280 result_d
.type
= BRW_REGISTER_TYPE_D
; /* does it matter? */
281 result
.type
= BRW_REGISTER_TYPE_F
;
282 emit(ADD(result
, result_d
, src_reg(-127)));
285 if (vpi
->DstReg
.WriteMask
& WRITEMASK_YZ
) {
286 result
.writemask
= WRITEMASK_Y
;
287 result
.type
= BRW_REGISTER_TYPE_UD
;
288 emit(AND(result
, arg0_ud
, src_reg((1u << 23) - 1)));
289 emit(OR(result
, result_src
, src_reg(127u << 23)));
292 if (vpi
->DstReg
.WriteMask
& WRITEMASK_Z
) {
293 /* result[2] = result[0] + LOG2(result[1]); */
295 /* Why bother? The above is just a hint how to do this with a
296 * taylor series. Maybe we *should* use a taylor series as by
297 * the time all the above has been done it's almost certainly
298 * quicker than calling the mathbox, even with low precision.
301 * - result[0] + mathbox.LOG2(result[1])
302 * - mathbox.LOG2(arg0.x)
303 * - result[0] + inline_taylor_approx(result[1])
305 result
.type
= BRW_REGISTER_TYPE_F
;
306 result
.writemask
= WRITEMASK_Z
;
307 src_reg
result_x(result
), result_y(result
), result_z(result
);
308 result_x
.swizzle
= BRW_SWIZZLE_XXXX
;
309 result_y
.swizzle
= BRW_SWIZZLE_YYYY
;
310 result_z
.swizzle
= BRW_SWIZZLE_ZZZZ
;
311 emit_math(SHADER_OPCODE_LOG2
, result
, result_y
);
312 emit(ADD(result
, result_z
, result_x
));
315 if (vpi
->DstReg
.WriteMask
& WRITEMASK_W
) {
316 result
.type
= BRW_REGISTER_TYPE_F
;
317 result
.writemask
= WRITEMASK_W
;
318 emit(MOV(result
, src_reg(1.0f
)));
/* presumably OPCODE_MAD (case label missing): dst = src0*src1 + src2,
 * via an explicit temporary (no fused MAD emitted here). */
324 src_reg temp
= src_reg(this, glsl_type::vec4_type
);
325 emit(MUL(dst_reg(temp
), src
[0], src
[1]));
326 emit(ADD(dst
, temp
, src
[2]));
/* presumably OPCODE_MAX / MIN (case labels missing): SEL with G/L. */
331 emit_minmax(BRW_CONDITIONAL_G
, dst
, src
[0], src
[1]);
335 emit_minmax(BRW_CONDITIONAL_L
, dst
, src
[0], src
[1]);
/* presumably OPCODE_MOV / MUL / POW / RCP / RSQ (case labels missing). */
339 emit(MOV(dst
, src
[0]));
343 emit(MUL(dst
, src
[0], src
[1]));
347 emit_math(SHADER_OPCODE_POW
, dst
, src
[0], src
[1]);
351 emit_math(SHADER_OPCODE_RCP
, dst
, src
[0]);
355 emit_math(SHADER_OPCODE_RSQ
, dst
, src
[0]);
/* presumably OPCODE_SGE / SLT (case labels missing): see emit_vp_sop. */
359 emit_vp_sop(BRW_CONDITIONAL_GE
, dst
, src
[0], src
[1], one
);
363 emit_vp_sop(BRW_CONDITIONAL_L
, dst
, src
[0], src
[1], one
);
/* presumably OPCODE_SUB (case label missing): ADD with src1 negated. */
367 src_reg neg_src1
= src
[1];
368 neg_src1
.negate
= !src
[1].negate
;
369 emit(ADD(dst
, src
[0], neg_src1
));
/* presumably OPCODE_SWZ (case label missing). */
374 /* Note that SWZ's extended swizzles are handled in the general
375 * get_src_reg() code.
377 emit(MOV(dst
, src
[0]));
/* presumably OPCODE_XPD (case label missing): cross product via two MULs
 * of rotated swizzles and an ADD (t2 presumably negated; the negate line
 * is missing from the listing). */
381 src_reg t1
= src_reg(this, glsl_type::vec4_type
);
382 src_reg t2
= src_reg(this, glsl_type::vec4_type
);
384 emit(MUL(dst_reg(t1
),
385 reswizzle(src
[0], 1, 2, 0, 3),
386 reswizzle(src
[1], 2, 0, 1, 3)));
387 emit(MUL(dst_reg(t2
),
388 reswizzle(src
[0], 2, 0, 1, 3),
389 reswizzle(src
[1], 1, 2, 0, 3)));
391 emit(ADD(dst
, t1
, t2
));
/* presumably the default: case (label missing): unsupported opcode. */
399 _mesa_problem(ctx
, "Unsupported opcode %s in vertex program\n",
400 _mesa_opcode_string(vpi
->Opcode
));
403 /* Copy the temporary back into the actual destination register. */
404 if (vpi
->Opcode
!= OPCODE_END
) {
405 emit(MOV(get_vp_dst_reg(vpi
->DstReg
), src_reg(dst
)));
409 /* If we used relative addressing, we need to upload all constants as
410 * pull constants. Do that now.
412 if (this->need_all_constants_in_pull_buffer
) {
413 const struct gl_program_parameter_list
*params
= c
->vp
->program
.Base
.Parameters
;
415 for (i
= 0; i
< params
->NumParameters
* 4; i
++) {
/* NOTE(review): "¶ms" below is a mis-encoded "&params" (an HTML
 * "&para;" entity leaked into the listing) -- reproduced as-is. */
416 prog_data
->base
.pull_param
[i
] = ¶ms
->ParameterValues
[i
/ 4][i
% 4].f
;
418 prog_data
->base
.nr_pull_params
= i
;
/* Allocate the Vec4 IR registers backing each ARB_vp register file:
 * temporaries (vp_temp_regs), parameters/uniforms (push constants via
 * prog_data->base.param), outputs (output_reg[], driven by the VUE map),
 * and the single address register (vp_addr_reg).
 *
 * NOTE(review): garbled listing -- the original line numbers are fused
 * into the text and several lines (braces, the "else" of the PSIZ check)
 * are missing from the visible text. Code reproduced byte-for-byte;
 * only comments were added.
 */
423 vec4_visitor::setup_vp_regs()
425 /* PROGRAM_TEMPORARY */
426 int num_temp
= prog
->NumTemporaries
;
427 vp_temp_regs
= rzalloc_array(mem_ctx
, src_reg
, num_temp
);
428 for (int i
= 0; i
< num_temp
; i
++)
429 vp_temp_regs
[i
] = src_reg(this, glsl_type::vec4_type
);
431 /* PROGRAM_STATE_VAR etc. */
432 struct gl_program_parameter_list
*plist
= c
->vp
->program
.Base
.Parameters
;
433 for (unsigned p
= 0; p
< plist
->NumParameters
; p
++) {
434 unsigned components
= plist
->Parameters
[p
].Size
;
436 /* Parameters should be either vec4 uniforms or single component
437 * constants; matrices and other larger types should have been broken
440 assert(components
<= 4);
/* Each parameter occupies one vec4 uniform slot; unused channels of a
 * smaller parameter get a 0 (null) param pointer. */
442 this->uniform_size
[this->uniforms
] = 1; /* 1 vec4 */
443 this->uniform_vector_size
[this->uniforms
] = components
;
444 for (unsigned i
= 0; i
< 4; i
++) {
445 prog_data
->base
.param
[this->uniforms
* 4 + i
] = i
>= components
446 ? 0 : &plist
->ParameterValues
[p
][i
].f
;
448 this->uniforms
++; /* counted in vec4 units */
/* PROGRAM_OUTPUT: walk the VUE map; PSIZ gets a single float register,
 * everything else a vec4 (the "else" line for the non-PSIZ assignment is
 * missing from the listing). */
452 for (int slot
= 0; slot
< prog_data
->base
.vue_map
.num_slots
; slot
++) {
453 int varying
= prog_data
->base
.vue_map
.slot_to_varying
[slot
];
454 if (varying
== VARYING_SLOT_PSIZ
)
455 output_reg
[varying
] = dst_reg(this, glsl_type::float_type
);
457 output_reg
[varying
] = dst_reg(this, glsl_type::vec4_type
);
458 assert(output_reg
[varying
].type
== BRW_REGISTER_TYPE_F
);
461 /* PROGRAM_ADDRESS */
462 this->vp_addr_reg
= src_reg(this, glsl_type::int_type
);
463 assert(this->vp_addr_reg
.type
== BRW_REGISTER_TYPE_D
);
/* Map a Mesa prog_dst_register onto the dst_reg allocated for its file in
 * setup_vp_regs(), then apply the instruction's write mask. Relative
 * addressing of destinations is not supported (asserted below).
 *
 * NOTE(review): garbled listing -- the "switch (dst.File)" line, the
 * PROGRAM_OUTPUT case label, the "break;"s, "return dst_null_d();" for
 * PROGRAM_UNDEFINED, the "default:" label and the final "return result;"
 * are missing from the visible text. Code reproduced byte-for-byte;
 * only comments were added.
 */
467 vec4_visitor::get_vp_dst_reg(const prog_dst_register
&dst
)
471 assert(!dst
.RelAddr
);
474 case PROGRAM_TEMPORARY
:
475 result
= dst_reg(vp_temp_regs
[dst
.Index
]);
/* PROGRAM_OUTPUT (case label missing from listing). */
479 result
= output_reg
[dst
.Index
];
482 case PROGRAM_ADDRESS
: {
483 assert(dst
.Index
== 0);
484 result
= dst_reg(this->vp_addr_reg
);
488 case PROGRAM_UNDEFINED
:
/* NOTE(review): asserting on a string literal is always true, so this
 * assert can never fire -- presumably it was meant to be
 * assert(!"vec4_vp: bad destination register file"). Left as-is since
 * the surrounding default: label is missing from the listing. */
492 assert("vec4_vp: bad destination register file");
493 return dst_reg(this, glsl_type::vec4_type
);
496 result
.writemask
= dst
.WriteMask
;
501 vec4_visitor::get_vp_src_reg(const prog_src_register
&src
)
503 struct gl_program_parameter_list
*plist
= c
->vp
->program
.Base
.Parameters
;
510 case PROGRAM_UNDEFINED
:
511 return src_reg(brw_null_reg());
513 case PROGRAM_TEMPORARY
:
514 result
= vp_temp_regs
[src
.Index
];
518 result
= src_reg(ATTR
, src
.Index
, glsl_type::vec4_type
);
519 result
.type
= BRW_REGISTER_TYPE_F
;
522 case PROGRAM_ADDRESS
: {
523 assert(src
.Index
== 0);
524 result
= this->vp_addr_reg
;
528 case PROGRAM_STATE_VAR
:
529 case PROGRAM_CONSTANT
:
530 /* From the ARB_vertex_program specification:
531 * "Relative addressing can only be used for accessing program
535 /* Since we have no idea what the base of the array is, we need to
536 * upload ALL constants as push constants.
538 this->need_all_constants_in_pull_buffer
= true;
540 /* Add the small constant index to the address register */
541 src_reg reladdr
= src_reg(this, glsl_type::int_type
);
542 dst_reg dst_reladdr
= dst_reg(reladdr
);
543 dst_reladdr
.writemask
= WRITEMASK_X
;
544 emit(ADD(dst_reladdr
, this->vp_addr_reg
, src_reg(src
.Index
)));
547 emit(MUL(dst_reladdr
, reladdr
, src_reg(16)));
550 assert(src
.Index
< this->uniforms
);
551 result
= src_reg(dst_reg(UNIFORM
, 0));
552 result
.type
= BRW_REGISTER_TYPE_F
;
553 result
.reladdr
= new(mem_ctx
) src_reg();
554 memcpy(result
.reladdr
, &reladdr
, sizeof(src_reg
));
557 result
= src_reg(this, glsl_type::vec4_type
);
558 src_reg surf_index
= src_reg(unsigned(SURF_INDEX_VERT_CONST_BUFFER
));
559 vec4_instruction
*load
=
560 new(mem_ctx
) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD
,
561 dst_reg(result
), surf_index
, reladdr
);
568 /* We actually want to look at the type in the Parameters list for this,
569 * because this lets us upload constant builtin uniforms as actual
572 switch (plist
->Parameters
[src
.Index
].Type
) {
573 case PROGRAM_CONSTANT
:
574 result
= src_reg(this, glsl_type::vec4_type
);
575 for (int i
= 0; i
< 4; i
++) {
576 dst_reg t
= dst_reg(result
);
577 t
.writemask
= 1 << i
;
578 emit(MOV(t
, src_reg(plist
->ParameterValues
[src
.Index
][i
].f
)));
582 case PROGRAM_STATE_VAR
:
583 assert(src
.Index
< this->uniforms
);
584 result
= src_reg(dst_reg(UNIFORM
, src
.Index
));
585 result
.type
= BRW_REGISTER_TYPE_F
;
589 _mesa_problem(ctx
, "bad uniform src register file: %s\n",
590 _mesa_register_file_name((gl_register_file
)src
.File
));
591 return src_reg(this, glsl_type::vec4_type
);
596 _mesa_problem(ctx
, "bad src register file: %s\n",
597 _mesa_register_file_name((gl_register_file
)src
.File
));
598 return src_reg(this, glsl_type::vec4_type
);
601 if (src
.Swizzle
!= SWIZZLE_NOOP
|| src
.Negate
) {
602 unsigned short zeros_mask
= 0;
603 unsigned short ones_mask
= 0;
604 unsigned short src_mask
= 0;
605 unsigned short src_swiz
[4];
607 for (int i
= 0; i
< 4; i
++) {
608 src_swiz
[i
] = 0; /* initialize for safety */
610 /* The ZERO, ONE, and Negate options are only used for OPCODE_SWZ,
611 * but it's simplest to handle it here.
613 int s
= GET_SWZ(src
.Swizzle
, i
);
623 zeros_mask
|= 1 << i
;
632 BRW_SWIZZLE4(src_swiz
[0], src_swiz
[1], src_swiz
[2], src_swiz
[3]);
634 /* The hardware doesn't natively handle the SWZ instruction's zero/one
635 * swizzles or per-component negation, so we need to use a temporary.
637 if (zeros_mask
|| ones_mask
|| src
.Negate
) {
638 src_reg
temp_src(this, glsl_type::vec4_type
);
639 dst_reg
temp(temp_src
);
642 temp
.writemask
= src_mask
;
643 emit(MOV(temp
, result
));
647 temp
.writemask
= zeros_mask
;
648 emit(MOV(temp
, src_reg(0.0f
)));
652 temp
.writemask
= ones_mask
;
653 emit(MOV(temp
, src_reg(1.0f
)));
657 temp
.writemask
= src
.Negate
;
658 src_reg
neg(temp_src
);
660 emit(MOV(temp
, neg
));