/*
 * Copyright © 2012 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/** @file brw_vec4_vp.cpp
 *
 * A translator from Mesa IR to the i965 driver's Vec4 IR, used to implement
 * ARB_vertex_program and fixed-function vertex processing.
 */

#include "brw_context.h"
#include "brw_vec4.h"
#include "brw_vs.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
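
/* Helper for the OPCODE_SGE/OPCODE_SLT cases below: writes 1.0 to every
 * enabled channel of dst where "src0 <conditional_mod> src1" holds and 0.0
 * where it does not, by emitting a CMP that sets the flag register followed
 * by a predicated SEL between the caller-provided "one" register and an
 * immediate 0.0f.
 */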

void
vec4_visitor::emit_vp_sop(enum brw_conditional_mod conditional_mod,
                          dst_reg dst, src_reg src0, src_reg src1,
                          src_reg one)
{
   vec4_instruction *inst;

   inst = emit(CMP(dst_null_f(), src0, src1, conditional_mod));

   inst = emit(BRW_OPCODE_SEL, dst, one, src_reg(0.0f));
   inst->predicate = BRW_PREDICATE_NORMAL;
}

void
vec4_vs_visitor::emit_program_code()
{
   this->need_all_constants_in_pull_buffer = false;

   setup_vp_regs();

   /* Keep a reg with 1.0 around, for reuse by emit_vp_sop so that it can just
    * emit a predicated SEL between this register and 0.0.
    */
   src_reg one = src_reg(this, glsl_type::float_type);
   emit(MOV(dst_reg(one), src_reg(1.0f)));

   for (unsigned int insn = 0; insn < prog->NumInstructions; insn++) {
      const struct prog_instruction *vpi = &prog->Instructions[insn];

      dst_reg dst;
      src_reg src[3];

      /* We always emit into a temporary destination register to avoid
       * aliasing with the instruction's source registers.
       */
      dst = dst_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 3; i++)
         src[i] = get_vp_src_reg(vpi->SrcReg[i]);

      switch (vpi->Opcode) {
      case OPCODE_ABS:
         src[0].abs = true;
         src[0].negate = false;
         emit(MOV(dst, src[0]));
         break;

      case OPCODE_ADD:
         emit(ADD(dst, src[0], src[1]));
         break;
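
      /* OPCODE_ARL loads the address register with floor(src0.x); that
       * register is consumed later by the relative-addressing path in
       * get_vp_src_reg() (see the PROGRAM_ADDRESS and RelAddr handling
       * below).
       */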
      case OPCODE_ARL:
         if (devinfo->gen >= 8) {
            dst.writemask = WRITEMASK_X;
            dst_reg dst_f = dst;
            dst_f.type = BRW_REGISTER_TYPE_F;

            emit(RNDD(dst_f, src[0]));
            emit(MOV(dst, src_reg(dst_f)));
         } else {
            emit(RNDD(dst, src[0]));
         }
         break;

      case OPCODE_DP3:
         emit(DP3(dst, src[0], src[1]));
         break;

      case OPCODE_DP4:
         emit(DP4(dst, src[0], src[1]));
         break;

      case OPCODE_DPH:
         emit(DPH(dst, src[0], src[1]));
         break;
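
      /* OPCODE_DST builds the distance-attenuation vector described in the
       * ARB_vertex_program spec, result = (1, src0.y * src1.y, src0.z,
       * src1.w), written one channel at a time below according to the
       * destination write mask.
       */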
      case OPCODE_DST: {
         dst_reg t = dst;
         if (vpi->DstReg.WriteMask & WRITEMASK_X) {
            t.writemask = WRITEMASK_X;
            emit(MOV(t, src_reg(1.0f)));
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_Y) {
            t.writemask = WRITEMASK_Y;
            emit(MUL(t, src[0], src[1]));
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_Z) {
            t.writemask = WRITEMASK_Z;
            emit(MOV(t, src[0]));
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_W) {
            t.writemask = WRITEMASK_W;
            emit(MOV(t, src[1]));
         }
         break;
      }

      case OPCODE_EXP: {
         dst_reg result = dst;
         if (vpi->DstReg.WriteMask & WRITEMASK_X) {
            /* tmp_d = floor(src[0].x) */
            src_reg tmp_d = src_reg(this, glsl_type::ivec4_type);
            assert(tmp_d.type == BRW_REGISTER_TYPE_D);
            emit(RNDD(dst_reg(tmp_d), swizzle(src[0], BRW_SWIZZLE_XXXX)));

            /* result[0] = 2.0 ^ tmp */
            /* Adjust exponent for floating point: exp += 127 */
            dst_reg tmp_d_x(GRF, tmp_d.reg, glsl_type::int_type, WRITEMASK_X);
            emit(ADD(tmp_d_x, tmp_d, src_reg(127)));

            /* Install exponent and sign.  Excess drops off the edge: */
            dst_reg res_d_x(GRF, result.reg, glsl_type::int_type, WRITEMASK_X);
            emit(BRW_OPCODE_SHL, res_d_x, tmp_d, src_reg(23));
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_Y) {
            result.writemask = WRITEMASK_Y;
            emit(FRC(result, src[0]));
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_Z) {
            result.writemask = WRITEMASK_Z;
            emit_math(SHADER_OPCODE_EXP2, result, src[0]);
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_W) {
            result.writemask = WRITEMASK_W;
            emit(MOV(result, src_reg(1.0f)));
         }
         break;
      }
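
      /* Worked example of the exponent trick in the X channel above: for
       * tmp_d.x == 3, (3 + 127) << 23 == 0x41000000, which is the IEEE-754
       * single-precision bit pattern of 8.0f == 2.0^3, so writing the shifted
       * integer into the float result yields 2^floor(src0.x).
       */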
      case OPCODE_EX2:
         emit_math(SHADER_OPCODE_EXP2, dst, src[0]);
         break;

      case OPCODE_FLR:
         emit(RNDD(dst, src[0]));
         break;

      case OPCODE_FRC:
         emit(FRC(dst, src[0]));
         break;

      case OPCODE_LG2:
         emit_math(SHADER_OPCODE_LOG2, dst, src[0]);
         break;

      case OPCODE_LIT: {
         dst_reg result = dst;
         /* From the ARB_vertex_program spec:
          *
          *      tmp = VectorLoad(op0);
          *      if (tmp.x < 0) tmp.x = 0;
          *      if (tmp.y < 0) tmp.y = 0;
          *      if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
          *      else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
          *
          *      result.x = 1.0;
          *      result.y = tmp.x;
          *      result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
          *      result.w = 1.0;
          *
          * Note that we don't do the clamping to +/- 128.  We didn't in
          * brw_vs_emit.c either.
          */
         if (vpi->DstReg.WriteMask & WRITEMASK_XW) {
            result.writemask = WRITEMASK_XW;
            emit(MOV(result, src_reg(1.0f)));
         }
         if (vpi->DstReg.WriteMask & WRITEMASK_YZ) {
            result.writemask = WRITEMASK_YZ;
            emit(MOV(result, src_reg(0.0f)));

            src_reg tmp_x = swizzle(src[0], BRW_SWIZZLE_XXXX);

            emit(CMP(dst_null_d(), tmp_x, src_reg(0.0f), BRW_CONDITIONAL_G));
            emit(IF(BRW_PREDICATE_NORMAL));

            if (vpi->DstReg.WriteMask & WRITEMASK_Y) {
               result.writemask = WRITEMASK_Y;
               emit(MOV(result, tmp_x));
            }

            if (vpi->DstReg.WriteMask & WRITEMASK_Z) {
               /* if (tmp.y < 0) tmp.y = 0; */
               src_reg tmp_y = swizzle(src[0], BRW_SWIZZLE_YYYY);
               result.writemask = WRITEMASK_Z;
               emit_minmax(BRW_CONDITIONAL_GE, result, tmp_y, src_reg(0.0f));

               src_reg clamped_y(result);
               clamped_y.swizzle = BRW_SWIZZLE_ZZZZ;

               src_reg tmp_w = swizzle(src[0], BRW_SWIZZLE_WWWW);

               emit_math(SHADER_OPCODE_POW, result, clamped_y, tmp_w);
            }

            emit(BRW_OPCODE_ENDIF);
         }
         break;
      }

      case OPCODE_LOG: {
         dst_reg result = dst;
         result.type = BRW_REGISTER_TYPE_UD;
         src_reg result_src = src_reg(result);

         src_reg arg0_ud = swizzle(src[0], BRW_SWIZZLE_XXXX);
         arg0_ud.type = BRW_REGISTER_TYPE_UD;

         /* Perform mant = frexpf(fabsf(x), &exp), adjusting exp and mant by
          * operating directly on the IEEE-754 bit pattern.  These almost look
          * like they could be joined up, but not really practical:
          *
          *    result[0].f = ((x.i & ((1u << 31) - 1)) >> 23) - 127
          *    result[1].i = (x.i & ((1u << 23) - 1)) | (127 << 23)
          */
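         /* Worked example: for x == 12.0f the bit pattern is 0x41400000, so
          * result[0] becomes (0x41400000 >> 23) - 127 == 130 - 127 == 3 (the
          * exponent), while result[1] keeps the mantissa bits with a rebased
          * exponent, 0x3fc00000 == 1.5f, so the Z channel below computes
          * log2(12) as 3 + log2(1.5).
          */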
         if (vpi->DstReg.WriteMask & WRITEMASK_XZ) {
            result.writemask = WRITEMASK_X;
            emit(AND(result, arg0_ud, src_reg((1u << 31) - 1)));
            emit(BRW_OPCODE_SHR, result, result_src, src_reg(23u));
            src_reg result_d(result_src);
            result_d.type = BRW_REGISTER_TYPE_D; /* does it matter? */
            result.type = BRW_REGISTER_TYPE_F;
            emit(ADD(result, result_d, src_reg(-127)));
         }

         if (vpi->DstReg.WriteMask & WRITEMASK_YZ) {
            result.writemask = WRITEMASK_Y;
            result.type = BRW_REGISTER_TYPE_UD;
            emit(AND(result, arg0_ud, src_reg((1u << 23) - 1)));
            emit(OR(result, result_src, src_reg(127u << 23)));
         }

         if (vpi->DstReg.WriteMask & WRITEMASK_Z) {
            /* result[2] = result[0] + LOG2(result[1]); */

            /* Why bother?  The above is just a hint at how to do this with a
             * Taylor series.  Maybe we *should* use a Taylor series as by
             * the time all the above has been done it's almost certainly
             * quicker than calling the mathbox, even with low precision.
             *
             * Options are:
             *    - result[0] + mathbox.LOG2(result[1])
             *    - mathbox.LOG2(arg0.x)
             *    - result[0] + inline_taylor_approx(result[1])
             */
            result.type = BRW_REGISTER_TYPE_F;
            result.writemask = WRITEMASK_Z;

            src_reg result_x(result), result_y(result), result_z(result);
            result_x.swizzle = BRW_SWIZZLE_XXXX;
            result_y.swizzle = BRW_SWIZZLE_YYYY;
            result_z.swizzle = BRW_SWIZZLE_ZZZZ;

            emit_math(SHADER_OPCODE_LOG2, result, result_y);
            emit(ADD(result, result_z, result_x));
         }

         if (vpi->DstReg.WriteMask & WRITEMASK_W) {
            result.type = BRW_REGISTER_TYPE_F;
            result.writemask = WRITEMASK_W;
            emit(MOV(result, src_reg(1.0f)));
         }
         break;
      }

      case OPCODE_MAD: {
         src_reg temp = src_reg(this, glsl_type::vec4_type);
         emit(MUL(dst_reg(temp), src[0], src[1]));
         emit(ADD(dst, temp, src[2]));
         break;
      }

      case OPCODE_MAX:
         emit_minmax(BRW_CONDITIONAL_GE, dst, src[0], src[1]);
         break;

      case OPCODE_MIN:
         emit_minmax(BRW_CONDITIONAL_L, dst, src[0], src[1]);
         break;

      case OPCODE_MOV:
         emit(MOV(dst, src[0]));
         break;

      case OPCODE_MUL:
         emit(MUL(dst, src[0], src[1]));
         break;

      case OPCODE_POW:
         emit_math(SHADER_OPCODE_POW, dst, src[0], src[1]);
         break;

      case OPCODE_RCP:
         emit_math(SHADER_OPCODE_RCP, dst, src[0]);
         break;

      case OPCODE_RSQ:
         emit_math(SHADER_OPCODE_RSQ, dst, src[0]);
         break;

      case OPCODE_SGE:
         emit_vp_sop(BRW_CONDITIONAL_GE, dst, src[0], src[1], one);
         break;

      case OPCODE_SLT:
         emit_vp_sop(BRW_CONDITIONAL_L, dst, src[0], src[1], one);
         break;

      case OPCODE_SUB: {
         src_reg neg_src1 = src[1];
         neg_src1.negate = !src[1].negate;
         emit(ADD(dst, src[0], neg_src1));
         break;
      }

      case OPCODE_SWZ:
         /* Note that SWZ's extended swizzles are handled in the general
          * get_vp_src_reg() code.
          */
         emit(MOV(dst, src[0]));
         break;

      case OPCODE_XPD: {
         src_reg t1 = src_reg(this, glsl_type::vec4_type);
         src_reg t2 = src_reg(this, glsl_type::vec4_type);

         emit(MUL(dst_reg(t1),
                  swizzle(src[0], BRW_SWIZZLE_YZXW),
                  swizzle(src[1], BRW_SWIZZLE_ZXYW)));
         emit(MUL(dst_reg(t2),
                  swizzle(src[0], BRW_SWIZZLE_ZXYW),
                  swizzle(src[1], BRW_SWIZZLE_YZXW)));
         t2.negate = true;
         emit(ADD(dst, t1, t2));
         break;
      }
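
      /* The two MULs above are the standard cross-product identity
       * a x b = a.yzx * b.zxy - a.zxy * b.yzx (the W channel is not
       * meaningful for XPD), which is why t2 is negated before the final ADD.
       */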

      default:
         _mesa_problem(ctx, "Unsupported opcode %s in vertex program\n",
                       _mesa_opcode_string(vpi->Opcode));
         break;
      }

      /* Copy the temporary back into the actual destination register. */
      if (_mesa_num_inst_dst_regs(vpi->Opcode) != 0) {
         emit(MOV(get_vp_dst_reg(vpi->DstReg), src_reg(dst)));
      }
   }

   /* If we used relative addressing, we need to upload all constants as
    * pull constants.  Do that now.
    */
   if (this->need_all_constants_in_pull_buffer) {
      const struct gl_program_parameter_list *params =
         vs_compile->vp->program.Base.Parameters;
      unsigned i;
      for (i = 0; i < params->NumParameters * 4; i++) {
         stage_prog_data->pull_param[i] =
            &params->ParameterValues[i / 4][i % 4];
      }
      stage_prog_data->nr_pull_params = i;
   }
}

void
vec4_vs_visitor::setup_vp_regs()
{
   /* PROGRAM_TEMPORARY */
   int num_temp = prog->NumTemporaries;
   vp_temp_regs = rzalloc_array(mem_ctx, src_reg, num_temp);
   for (int i = 0; i < num_temp; i++)
      vp_temp_regs[i] = src_reg(this, glsl_type::vec4_type);

   /* PROGRAM_STATE_VAR etc. */
   struct gl_program_parameter_list *plist =
      vs_compile->vp->program.Base.Parameters;
   for (unsigned p = 0; p < plist->NumParameters; p++) {
      unsigned components = plist->Parameters[p].Size;

      /* Parameters should be either vec4 uniforms or single component
       * constants; matrices and other larger types should have been broken
       * down into vec4s before we get here.
       */
      assert(components <= 4);

      this->uniform_size[this->uniforms] = 1; /* 1 vec4 */
      this->uniform_vector_size[this->uniforms] = components;
      for (unsigned i = 0; i < 4; i++) {
         stage_prog_data->param[this->uniforms * 4 + i] = i >= components
            ? 0 : &plist->ParameterValues[p][i];
      }
      this->uniforms++; /* counted in vec4 units */
   }

   /* PROGRAM_OUTPUT */
   for (int slot = 0; slot < prog_data->vue_map.num_slots; slot++) {
      int varying = prog_data->vue_map.slot_to_varying[slot];
      if (varying == VARYING_SLOT_PSIZ)
         output_reg[varying] = dst_reg(this, glsl_type::float_type);
      else
         output_reg[varying] = dst_reg(this, glsl_type::vec4_type);
      assert(output_reg[varying].type == BRW_REGISTER_TYPE_F);
   }

   /* PROGRAM_ADDRESS */
   this->vp_addr_reg = src_reg(this, glsl_type::int_type);
   assert(this->vp_addr_reg.type == BRW_REGISTER_TYPE_D);
}

dst_reg
vec4_vs_visitor::get_vp_dst_reg(const prog_dst_register &dst)
{
   dst_reg result;

   assert(!dst.RelAddr);

   switch (dst.File) {
   case PROGRAM_TEMPORARY:
      result = dst_reg(vp_temp_regs[dst.Index]);
      break;

   case PROGRAM_OUTPUT:
      result = output_reg[dst.Index];
      break;

   case PROGRAM_ADDRESS: {
      assert(dst.Index == 0);
      result = dst_reg(this->vp_addr_reg);
      break;
   }

   case PROGRAM_UNDEFINED:
      return dst_null_f();

   default:
      unreachable("vec4_vp: bad destination register file");
   }

   result.writemask = dst.WriteMask;
   return result;
}

src_reg
vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src)
{
   struct gl_program_parameter_list *plist =
      vs_compile->vp->program.Base.Parameters;

   src_reg result;

   switch (src.File) {
   case PROGRAM_UNDEFINED:
      return src_reg(brw_null_reg());

   case PROGRAM_TEMPORARY:
      result = vp_temp_regs[src.Index];
      break;

   case PROGRAM_INPUT:
      result = src_reg(ATTR, src.Index, glsl_type::vec4_type);
      result.type = BRW_REGISTER_TYPE_F;
      break;

   case PROGRAM_ADDRESS: {
      assert(src.Index == 0);
      result = this->vp_addr_reg;
      break;
   }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
      /* From the ARB_vertex_program specification:
       * "Relative addressing can only be used for accessing program
       *  parameter arrays."
       */
      if (src.RelAddr) {
         /* Since we have no idea what the base of the array is, we need to
          * upload ALL constants as pull constants.
          */
         this->need_all_constants_in_pull_buffer = true;

         /* Add the small constant index to the address register */
         src_reg reladdr = src_reg(this, glsl_type::int_type);

         /* We have to use a message header on Skylake to get SIMD4x2 mode.
          * Reserve space for the register.
          */
         if (devinfo->gen >= 9) {
            reladdr.reg_offset++;
            alloc.sizes[reladdr.reg] = 2;
         }

         dst_reg dst_reladdr = dst_reg(reladdr);
         dst_reladdr.writemask = WRITEMASK_X;
         emit(ADD(dst_reladdr, this->vp_addr_reg, src_reg(src.Index)));

         emit(MUL(dst_reladdr, reladdr, src_reg(16)));

         assert(src.Index < this->uniforms);
         result = src_reg(dst_reg(UNIFORM, 0));
         result.type = BRW_REGISTER_TYPE_F;
         result.reladdr = new(mem_ctx) src_reg();
         memcpy(result.reladdr, &reladdr, sizeof(src_reg));

         result = src_reg(this, glsl_type::vec4_type);
         src_reg surf_index =
            src_reg(unsigned(prog_data->base.binding_table.pull_constants_start));
         vec4_instruction *load;
         if (devinfo->gen >= 7) {
            load = new(mem_ctx)
               vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                dst_reg(result), surf_index, reladdr);
         } else {
            load = new(mem_ctx)
               vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
                                dst_reg(result), surf_index, reladdr);
         }
         emit(load);
         break;
      }

      /* We actually want to look at the type in the Parameters list for this,
       * because this lets us upload constant builtin uniforms as actual
       * constants.
       */
      switch (plist->Parameters[src.Index].Type) {
      case PROGRAM_CONSTANT:
         result = src_reg(this, glsl_type::vec4_type);
         for (int i = 0; i < 4; i++) {
            dst_reg t = dst_reg(result);
            t.writemask = 1 << i;
            emit(MOV(t, src_reg(plist->ParameterValues[src.Index][i].f)));
         }
         break;

      case PROGRAM_STATE_VAR:
         assert(src.Index < this->uniforms);
         result = src_reg(dst_reg(UNIFORM, src.Index));
         result.type = BRW_REGISTER_TYPE_F;
         break;

      default:
         _mesa_problem(ctx, "bad uniform src register file: %s\n",
                       _mesa_register_file_name((gl_register_file)src.File));
         return src_reg(this, glsl_type::vec4_type);
      }
      break;

   default:
      _mesa_problem(ctx, "bad src register file: %s\n",
                    _mesa_register_file_name((gl_register_file)src.File));
      return src_reg(this, glsl_type::vec4_type);
   }

   if (src.Swizzle != SWIZZLE_NOOP || src.Negate) {
      unsigned short zeros_mask = 0;
      unsigned short ones_mask = 0;
      unsigned short src_mask = 0;
      unsigned short src_swiz[4];

      for (int i = 0; i < 4; i++) {
         src_swiz[i] = 0; /* initialize for safety */

         /* The ZERO, ONE, and Negate options are only used for OPCODE_SWZ,
          * but it's simplest to handle it here.
          */
         int s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1 << i;
            src_swiz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1 << i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1 << i;
            break;
         }
      }

      result.swizzle =
         BRW_SWIZZLE4(src_swiz[0], src_swiz[1], src_swiz[2], src_swiz[3]);
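
      /* For example, the extended swizzle "SWZ dst, src, x, 0, 1, -w" from
       * ARB_vertex_program reaches this point with src_mask == 0x9 (channels
       * taken from the source), zeros_mask == 0x2, ones_mask == 0x4, and
       * src.Negate flagging the W channel; the temporary built below turns
       * those masks into the corresponding MOVs.
       */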

      /* The hardware doesn't natively handle the SWZ instruction's zero/one
       * swizzles or per-component negation, so we need to use a temporary.
       */
      if (zeros_mask || ones_mask || src.Negate) {
         src_reg temp_src(this, glsl_type::vec4_type);
         dst_reg temp(temp_src);

         if (src_mask) {
            temp.writemask = src_mask;
            emit(MOV(temp, result));
         }

         if (zeros_mask) {
            temp.writemask = zeros_mask;
            emit(MOV(temp, src_reg(0.0f)));
         }

         if (ones_mask) {
            temp.writemask = ones_mask;
            emit(MOV(temp, src_reg(1.0f)));
         }

         if (src.Negate) {
            temp.writemask = src.Negate;
            src_reg neg(temp_src);
            neg.negate = true;
            emit(MOV(temp, neg));
         }

         result = temp_src;
      }
   }

   return result;
}