1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
/**
 * Identifiers for instruction sequences that are emitted once and then
 * re-entered through invoke_subroutine().  Each value is used as an index
 * into c->subroutines[], so the enumerators must stay dense and zero-based.
 */
enum _subroutine {
   SUB_NOISE1,   /* passed with noise1_sub() by emit_noise1() */
   SUB_NOISE2,   /* passed with noise2_sub() by emit_noise2() */
   SUB_NOISE3,
   SUB_NOISE4
};
13 static struct brw_reg
get_dst_reg(struct brw_wm_compile
*c
,
14 const struct prog_instruction
*inst
,
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
22 GLboolean
brw_wm_is_glsl(const struct gl_fragment_program
*fp
)
26 for (i
= 0; i
< fp
->Base
.NumInstructions
; i
++) {
27 const struct prog_instruction
*inst
= &fp
->Base
.Instructions
[i
];
28 switch (inst
->Opcode
) {
51 reclaim_temps(struct brw_wm_compile
*c
);
54 /** Mark GRF register as used. */
56 prealloc_grf(struct brw_wm_compile
*c
, int r
)
58 c
->used_grf
[r
] = GL_TRUE
;
62 /** Mark given GRF register as not in use. */
64 release_grf(struct brw_wm_compile
*c
, int r
)
66 /*assert(c->used_grf[r]);*/
67 c
->used_grf
[r
] = GL_FALSE
;
68 c
->first_free_grf
= MIN2(c
->first_free_grf
, r
);
72 /** Return index of a free GRF, mark it as used. */
74 alloc_grf(struct brw_wm_compile
*c
)
77 for (r
= c
->first_free_grf
; r
< BRW_WM_MAX_GRF
; r
++) {
78 if (!c
->used_grf
[r
]) {
79 c
->used_grf
[r
] = GL_TRUE
;
80 c
->first_free_grf
= r
+ 1; /* a guess */
85 /* no free temps, try to reclaim some */
87 c
->first_free_grf
= 0;
90 for (r
= c
->first_free_grf
; r
< BRW_WM_MAX_GRF
; r
++) {
91 if (!c
->used_grf
[r
]) {
92 c
->used_grf
[r
] = GL_TRUE
;
93 c
->first_free_grf
= r
+ 1; /* a guess */
98 for (r
= 0; r
< BRW_WM_MAX_GRF
; r
++) {
99 assert(c
->used_grf
[r
]);
102 /* really, no free GRF regs found */
103 if (!c
->out_of_regs
) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL
, "i965: ran out of registers for fragment program");
106 c
->out_of_regs
= GL_TRUE
;
113 /** Return number of GRF registers used */
115 num_grf_used(const struct brw_wm_compile
*c
)
118 for (r
= BRW_WM_MAX_GRF
- 1; r
>= 0; r
--)
127 * Record the mapping of a Mesa register to a hardware register.
129 static void set_reg(struct brw_wm_compile
*c
, int file
, int index
,
130 int component
, struct brw_reg reg
)
132 c
->wm_regs
[file
][index
][component
].reg
= reg
;
133 c
->wm_regs
[file
][index
][component
].inited
= GL_TRUE
;
136 static struct brw_reg
alloc_tmp(struct brw_wm_compile
*c
)
140 /* if we need to allocate another temp, grow the tmp_regs[] array */
141 if (c
->tmp_index
== c
->tmp_max
) {
142 int r
= alloc_grf(c
);
144 /*printf("Out of temps in %s\n", __FUNCTION__);*/
145 r
= 50; /* XXX random register! */
147 c
->tmp_regs
[ c
->tmp_max
++ ] = r
;
150 /* form the GRF register */
151 reg
= brw_vec8_grf(c
->tmp_regs
[ c
->tmp_index
++ ], 0);
152 /*printf("alloc_temp %d\n", reg.nr);*/
153 assert(reg
.nr
< BRW_WM_MAX_GRF
);
159 * Save current temp register info.
160 * There must be a matching call to release_tmps().
162 static int mark_tmps(struct brw_wm_compile
*c
)
167 static struct brw_reg
lookup_tmp( struct brw_wm_compile
*c
, int index
)
169 return brw_vec8_grf( c
->tmp_regs
[ index
], 0 );
172 static void release_tmps(struct brw_wm_compile
*c
, int mark
)
178 * Convert Mesa src register to brw register.
180 * Since we're running in SOA mode each Mesa register corresponds to four
181 * hardware registers. We allocate the hardware registers as needed here.
183 * \param file register file, one of PROGRAM_x
184 * \param index register number
185 * \param component src component (X=0, Y=1, Z=2, W=3)
186 * \param nr not used?!?
187 * \param neg negate value?
188 * \param abs take absolute value?
190 static struct brw_reg
191 get_reg(struct brw_wm_compile
*c
, int file
, int index
, int component
,
192 int nr
, GLuint neg
, GLuint abs
)
196 case PROGRAM_STATE_VAR
:
197 case PROGRAM_CONSTANT
:
198 case PROGRAM_UNIFORM
:
199 file
= PROGRAM_STATE_VAR
;
201 case PROGRAM_UNDEFINED
:
202 return brw_null_reg();
203 case PROGRAM_TEMPORARY
:
206 case PROGRAM_PAYLOAD
:
209 _mesa_problem(NULL
, "Unexpected file in get_reg()");
210 return brw_null_reg();
214 assert(component
< 4);
216 /* see if we've already allocated a HW register for this Mesa register */
217 if (c
->wm_regs
[file
][index
][component
].inited
) {
219 reg
= c
->wm_regs
[file
][index
][component
].reg
;
222 /* no, allocate new register */
223 int grf
= alloc_grf(c
);
224 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
226 /* totally out of temps */
227 grf
= 51; /* XXX random register! */
230 reg
= brw_vec8_grf(grf
, 0);
231 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
233 set_reg(c
, file
, index
, component
, reg
);
236 if (neg
& (1 << component
)) {
247 * This is called if we run out of GRF registers. Examine the live intervals
248 * of temp regs in the program and free those which won't be used again.
251 reclaim_temps(struct brw_wm_compile
*c
)
253 GLint intBegin
[MAX_PROGRAM_TEMPS
];
254 GLint intEnd
[MAX_PROGRAM_TEMPS
];
257 /*printf("Reclaim temps:\n");*/
259 _mesa_find_temp_intervals(c
->prog_instructions
, c
->nr_fp_insns
,
262 for (index
= 0; index
< MAX_PROGRAM_TEMPS
; index
++) {
263 if (intEnd
[index
] != -1 && intEnd
[index
] < c
->cur_inst
) {
264 /* program temp[i] can be freed */
266 /*printf(" temp[%d] is dead\n", index);*/
267 for (component
= 0; component
< 4; component
++) {
268 if (c
->wm_regs
[PROGRAM_TEMPORARY
][index
][component
].inited
) {
269 int r
= c
->wm_regs
[PROGRAM_TEMPORARY
][index
][component
].reg
.nr
;
272 printf(" Reclaim temp %d, reg %d at inst %d\n",
273 index, r, c->cur_inst);
275 c
->wm_regs
[PROGRAM_TEMPORARY
][index
][component
].inited
= GL_FALSE
;
286 * Preallocate registers. This sets up the Mesa to hardware register
287 * mapping for certain registers, such as constants (uniforms/state vars)
290 static void prealloc_reg(struct brw_wm_compile
*c
)
294 int urb_read_length
= 0;
295 GLuint inputs
= FRAG_BIT_WPOS
| c
->fp_interp_emitted
;
296 GLuint reg_index
= 0;
298 memset(c
->used_grf
, GL_FALSE
, sizeof(c
->used_grf
));
299 c
->first_free_grf
= 0;
301 for (i
= 0; i
< 4; i
++) {
302 if (i
< c
->key
.nr_depth_regs
)
303 reg
= brw_vec8_grf(i
* 2, 0);
305 reg
= brw_vec8_grf(0, 0);
306 set_reg(c
, PROGRAM_PAYLOAD
, PAYLOAD_DEPTH
, i
, reg
);
308 reg_index
+= 2 * c
->key
.nr_depth_regs
;
312 const GLuint nr_params
= c
->fp
->program
.Base
.Parameters
->NumParameters
;
313 const GLuint nr_temps
= c
->fp
->program
.Base
.NumTemporaries
;
315 /* use a real constant buffer, or just use a section of the GRF? */
316 /* XXX this heuristic may need adjustment... */
317 if ((nr_params
+ nr_temps
) * 4 + reg_index
> 80)
318 c
->fp
->use_const_buffer
= GL_TRUE
;
320 c
->fp
->use_const_buffer
= GL_FALSE
;
321 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
323 if (c
->fp
->use_const_buffer
) {
324 /* We'll use a real constant buffer and fetch constants from
325 * it with a dataport read message.
328 /* number of float constants in CURBE */
329 c
->prog_data
.nr_params
= 0;
332 const struct gl_program_parameter_list
*plist
=
333 c
->fp
->program
.Base
.Parameters
;
336 /* number of float constants in CURBE */
337 c
->prog_data
.nr_params
= 4 * nr_params
;
339 /* loop over program constants (float[4]) */
340 for (i
= 0; i
< nr_params
; i
++) {
341 /* loop over XYZW channels */
342 for (j
= 0; j
< 4; j
++, index
++) {
343 reg
= brw_vec1_grf(reg_index
+ index
/ 8, index
% 8);
344 /* Save pointer to parameter/constant value.
345 * Constants will be copied in prepare_constant_buffer()
347 c
->prog_data
.param
[index
] = &plist
->ParameterValues
[i
][j
];
348 set_reg(c
, PROGRAM_STATE_VAR
, i
, j
, reg
);
351 /* number of constant regs used (each reg is float[8]) */
352 c
->nr_creg
= 2 * ((4 * nr_params
+ 15) / 16);
353 reg_index
+= c
->nr_creg
;
357 /* fragment shader inputs */
358 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
361 if (i
>= VERT_RESULT_VAR0
)
362 fp_input
= i
- VERT_RESULT_VAR0
+ FRAG_ATTRIB_VAR0
;
363 else if (i
<= VERT_RESULT_TEX7
)
368 if (fp_input
>= 0 && inputs
& (1 << fp_input
)) {
369 urb_read_length
= reg_index
;
370 reg
= brw_vec8_grf(reg_index
, 0);
371 for (j
= 0; j
< 4; j
++)
372 set_reg(c
, PROGRAM_PAYLOAD
, fp_input
, j
, reg
);
374 if (c
->key
.vp_outputs_written
& (1 << i
)) {
379 c
->prog_data
.first_curbe_grf
= c
->key
.nr_depth_regs
* 2;
380 c
->prog_data
.urb_read_length
= urb_read_length
;
381 c
->prog_data
.curb_read_length
= c
->nr_creg
;
382 c
->emit_mask_reg
= brw_uw1_reg(BRW_GENERAL_REGISTER_FILE
, reg_index
, 0);
384 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg_index
, 0);
387 /* mark GRF regs [0..reg_index-1] as in-use */
388 for (i
= 0; i
< reg_index
; i
++)
391 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
392 prealloc_grf(c
, 126);
393 prealloc_grf(c
, 127);
395 for (i
= 0; i
< c
->nr_fp_insns
; i
++) {
396 const struct prog_instruction
*inst
= &c
->prog_instructions
[i
];
397 struct brw_reg dst
[4];
399 switch (inst
->Opcode
) {
402 /* Allocate the channels of texture results contiguously,
403 * since they are written out that way by the sampler unit.
405 for (j
= 0; j
< 4; j
++) {
406 dst
[j
] = get_dst_reg(c
, inst
, j
);
408 assert(dst
[j
].nr
== dst
[j
- 1].nr
+ 1);
416 /* An instruction may reference up to three constants.
417 * They'll be found in these registers.
418 * XXX alloc these on demand!
420 if (c
->fp
->use_const_buffer
) {
421 for (i
= 0; i
< 3; i
++) {
422 c
->current_const
[i
].index
= -1;
423 c
->current_const
[i
].reg
= brw_vec8_grf(alloc_grf(c
), 0);
427 printf("USE CONST BUFFER? %d\n", c
->fp
->use_const_buffer
);
428 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index
);
434 * Check if any of the instruction's src registers are constants, uniforms,
435 * or statevars. If so, fetch any constants that we don't already have in
436 * the three GRF slots.
438 static void fetch_constants(struct brw_wm_compile
*c
,
439 const struct prog_instruction
*inst
)
441 struct brw_compile
*p
= &c
->func
;
444 /* loop over instruction src regs */
445 for (i
= 0; i
< 3; i
++) {
446 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
447 if (src
->File
== PROGRAM_STATE_VAR
||
448 src
->File
== PROGRAM_CONSTANT
||
449 src
->File
== PROGRAM_UNIFORM
) {
450 c
->current_const
[i
].index
= src
->Index
;
453 printf(" fetch const[%d] for arg %d into reg %d\n",
454 src
->Index
, i
, c
->current_const
[i
].reg
.nr
);
457 /* need to fetch the constant now */
459 c
->current_const
[i
].reg
, /* writeback dest */
460 src
->RelAddr
, /* relative indexing? */
461 16 * src
->Index
, /* byte offset */
462 SURF_INDEX_FRAG_CONST_BUFFER
/* binding table index */
470 * Convert Mesa dst register to brw register.
472 static struct brw_reg
get_dst_reg(struct brw_wm_compile
*c
,
473 const struct prog_instruction
*inst
,
477 return get_reg(c
, inst
->DstReg
.File
, inst
->DstReg
.Index
, component
, nr
,
482 static struct brw_reg
483 get_src_reg_const(struct brw_wm_compile
*c
,
484 const struct prog_instruction
*inst
,
485 GLuint srcRegIndex
, GLuint component
)
487 /* We should have already fetched the constant from the constant
488 * buffer in fetch_constants(). Now we just have to return a
489 * register description that extracts the needed component and
490 * smears it across all eight vector components.
492 const struct prog_src_register
*src
= &inst
->SrcReg
[srcRegIndex
];
493 struct brw_reg const_reg
;
495 assert(component
< 4);
496 assert(srcRegIndex
< 3);
497 assert(c
->current_const
[srcRegIndex
].index
!= -1);
498 const_reg
= c
->current_const
[srcRegIndex
].reg
;
500 /* extract desired float from the const_reg, and smear */
501 const_reg
= stride(const_reg
, 0, 1, 0);
502 const_reg
.subnr
= component
* 4;
504 if (src
->Negate
& (1 << component
))
505 const_reg
= negate(const_reg
);
507 const_reg
= brw_abs(const_reg
);
510 printf(" form const[%d].%d for arg %d, reg %d\n",
511 c
->current_const
[srcRegIndex
].index
,
522 * Convert Mesa src register to brw register.
524 static struct brw_reg
get_src_reg(struct brw_wm_compile
*c
,
525 const struct prog_instruction
*inst
,
526 GLuint srcRegIndex
, GLuint channel
)
528 const struct prog_src_register
*src
= &inst
->SrcReg
[srcRegIndex
];
530 const GLuint component
= GET_SWZ(src
->Swizzle
, channel
);
532 /* Extended swizzle terms */
533 if (component
== SWIZZLE_ZERO
) {
534 return brw_imm_f(0.0F
);
536 else if (component
== SWIZZLE_ONE
) {
537 return brw_imm_f(1.0F
);
540 if (c
->fp
->use_const_buffer
&&
541 (src
->File
== PROGRAM_STATE_VAR
||
542 src
->File
== PROGRAM_CONSTANT
||
543 src
->File
== PROGRAM_UNIFORM
)) {
544 return get_src_reg_const(c
, inst
, srcRegIndex
, component
);
547 /* other type of source register */
548 return get_reg(c
, src
->File
, src
->Index
, component
, nr
,
549 src
->Negate
, src
->Abs
);
554 * Subroutines are minimal support for reusable instruction sequences.
555 * They are implemented as simply as possible to minimise overhead: there
556 * is no explicit support for communication between the caller and callee
557 * other than saving the return address in a temporary register, nor is
558 * there any automatic local storage. This implies that great care is
559 * required before attempting reentrancy or any kind of nested
560 * subroutine invocations.
562 static void invoke_subroutine( struct brw_wm_compile
*c
,
563 enum _subroutine subroutine
,
564 void (*emit
)( struct brw_wm_compile
* ) )
566 struct brw_compile
*p
= &c
->func
;
568 assert( subroutine
< BRW_WM_MAX_SUBROUTINE
);
570 if( c
->subroutines
[ subroutine
] ) {
571 /* subroutine previously emitted: reuse existing instructions */
573 int mark
= mark_tmps( c
);
574 struct brw_reg return_address
= retype( alloc_tmp( c
),
575 BRW_REGISTER_TYPE_UD
);
576 int here
= p
->nr_insn
;
578 brw_push_insn_state(p
);
579 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
580 brw_ADD( p
, return_address
, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
582 brw_ADD( p
, brw_ip_reg(), brw_ip_reg(),
583 brw_imm_d( ( c
->subroutines
[ subroutine
] -
585 brw_pop_insn_state(p
);
587 release_tmps( c
, mark
);
589 /* previously unused subroutine: emit, and mark for later reuse */
591 int mark
= mark_tmps( c
);
592 struct brw_reg return_address
= retype( alloc_tmp( c
),
593 BRW_REGISTER_TYPE_UD
);
594 struct brw_instruction
*calc
;
595 int base
= p
->nr_insn
;
597 brw_push_insn_state(p
);
598 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
599 calc
= brw_ADD( p
, return_address
, brw_ip_reg(), brw_imm_ud( 0 ) );
600 brw_pop_insn_state(p
);
602 c
->subroutines
[ subroutine
] = p
->nr_insn
;
606 brw_push_insn_state(p
);
607 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
608 brw_MOV( p
, brw_ip_reg(), return_address
);
609 brw_pop_insn_state(p
);
611 brw_set_src1( calc
, brw_imm_ud( ( p
->nr_insn
- base
) << 4 ) );
613 release_tmps( c
, mark
);
617 /* Workaround for using brw_wm_emit.c's emit functions, which expect
618 * destination regs to be uniquely written. Moves arguments out to
619 * temporaries as necessary for instructions which use their destination as
623 unalias3(struct brw_wm_compile
*c
,
624 void (*func
)(struct brw_compile
*c
,
625 const struct brw_reg
*dst
,
627 const struct brw_reg
*arg0
,
628 const struct brw_reg
*arg1
,
629 const struct brw_reg
*arg2
),
630 const struct brw_reg
*dst
,
632 const struct brw_reg
*arg0
,
633 const struct brw_reg
*arg1
,
634 const struct brw_reg
*arg2
)
636 struct brw_compile
*p
= &c
->func
;
637 struct brw_reg tmp_arg0
[4], tmp_arg1
[4], tmp_arg2
[4];
639 int mark
= mark_tmps(c
);
641 for (j
= 0; j
< 4; j
++) {
642 tmp_arg0
[j
] = arg0
[j
];
643 tmp_arg1
[j
] = arg1
[j
];
644 tmp_arg2
[j
] = arg2
[j
];
647 for (i
= 0; i
< 4; i
++) {
649 for (j
= 0; j
< 4; j
++) {
650 if (arg0
[j
].file
== dst
[i
].file
&&
651 dst
[i
].nr
== arg0
[j
].nr
) {
652 tmp_arg0
[j
] = alloc_tmp(c
);
653 brw_MOV(p
, tmp_arg0
[j
], arg0
[j
]);
655 if (arg1
[j
].file
== dst
[i
].file
&&
656 dst
[i
].nr
== arg1
[j
].nr
) {
657 tmp_arg1
[j
] = alloc_tmp(c
);
658 brw_MOV(p
, tmp_arg1
[j
], arg1
[j
]);
660 if (arg2
[j
].file
== dst
[i
].file
&&
661 dst
[i
].nr
== arg2
[j
].nr
) {
662 tmp_arg2
[j
] = alloc_tmp(c
);
663 brw_MOV(p
, tmp_arg2
[j
], arg2
[j
]);
669 func(p
, dst
, mask
, tmp_arg0
, tmp_arg1
, tmp_arg2
);
671 release_tmps(c
, mark
);
674 static void fire_fb_write( struct brw_wm_compile
*c
,
680 struct brw_compile
*p
= &c
->func
;
681 /* Pass through control information:
683 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
685 brw_push_insn_state(p
);
686 brw_set_mask_control(p
, BRW_MASK_DISABLE
); /* ? */
688 brw_message_reg(base_reg
+ 1),
690 brw_pop_insn_state(p
);
692 /* Send framebuffer write message: */
694 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
),
696 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW
),
703 static void emit_fb_write(struct brw_wm_compile
*c
,
704 const struct prog_instruction
*inst
)
706 struct brw_compile
*p
= &c
->func
;
712 /* Reserve a space for AA - may not be needed:
714 if (c
->key
.aa_dest_stencil_reg
)
717 brw_push_insn_state(p
);
718 for (channel
= 0; channel
< 4; channel
++) {
719 src0
= get_src_reg(c
, inst
, 0, channel
);
720 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
721 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
722 brw_MOV(p
, brw_message_reg(nr
+ channel
), src0
);
724 /* skip over the regs populated above: */
726 brw_pop_insn_state(p
);
728 if (c
->key
.source_depth_to_render_target
) {
729 if (c
->key
.computes_depth
) {
730 src0
= get_src_reg(c
, inst
, 2, 2);
731 brw_MOV(p
, brw_message_reg(nr
), src0
);
734 src0
= get_src_reg(c
, inst
, 1, 1);
735 brw_MOV(p
, brw_message_reg(nr
), src0
);
741 if (c
->key
.dest_depth_reg
) {
742 const GLuint comp
= c
->key
.dest_depth_reg
/ 2;
743 const GLuint off
= c
->key
.dest_depth_reg
% 2;
746 /* XXX this code needs review/testing */
747 struct brw_reg arg1_0
= get_src_reg(c
, inst
, 1, comp
);
748 struct brw_reg arg1_1
= get_src_reg(c
, inst
, 1, comp
+1);
750 brw_push_insn_state(p
);
751 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
753 brw_MOV(p
, brw_message_reg(nr
), offset(arg1_0
, 1));
755 brw_MOV(p
, brw_message_reg(nr
+1), arg1_1
);
756 brw_pop_insn_state(p
);
760 struct brw_reg src
= get_src_reg(c
, inst
, 1, 1);
761 brw_MOV(p
, brw_message_reg(nr
), src
);
766 target
= INST_AUX_GET_TARGET(inst
->Aux
);
767 eot
= inst
->Aux
& INST_AUX_EOT
;
768 fire_fb_write(c
, 0, nr
, target
, eot
);
771 static void emit_arl(struct brw_wm_compile
*c
,
772 const struct prog_instruction
*inst
)
774 struct brw_compile
*p
= &c
->func
;
775 struct brw_reg src0
, addr_reg
;
776 brw_set_saturate(p
, (inst
->SaturateMode
!= SATURATE_OFF
) ? 1 : 0);
777 addr_reg
= brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE
,
779 src0
= get_src_reg(c
, inst
, 0, 0); /* channel 0 */
780 brw_MOV(p
, addr_reg
, src0
);
781 brw_set_saturate(p
, 0);
785 static void emit_min_max(struct brw_wm_compile
*c
,
786 const struct prog_instruction
*inst
)
788 struct brw_compile
*p
= &c
->func
;
789 const GLuint mask
= inst
->DstReg
.WriteMask
;
790 const int mark
= mark_tmps(c
);
792 brw_push_insn_state(p
);
793 for (i
= 0; i
< 4; i
++) {
795 struct brw_reg real_dst
= get_dst_reg(c
, inst
, i
);
796 struct brw_reg src0
= get_src_reg(c
, inst
, 0, i
);
797 struct brw_reg src1
= get_src_reg(c
, inst
, 1, i
);
799 /* if dst==src0 or dst==src1 we need to use a temp reg */
800 GLboolean use_temp
= brw_same_reg(dst
, src0
) ||
801 brw_same_reg(dst
, src1
);
808 printf(" Min/max: dst %d src0 %d src1 %d\n",
809 dst.nr, src0.nr, src1.nr);
811 brw_set_saturate(p
, (inst
->SaturateMode
!= SATURATE_OFF
) ? 1 : 0);
812 brw_MOV(p
, dst
, src0
);
813 brw_set_saturate(p
, 0);
815 if (inst
->Opcode
== OPCODE_MIN
)
816 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, src1
, src0
);
818 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, src1
, src0
);
820 brw_set_saturate(p
, (inst
->SaturateMode
!= SATURATE_OFF
) ? 1 : 0);
821 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
822 brw_MOV(p
, dst
, src1
);
823 brw_set_saturate(p
, 0);
824 brw_set_predicate_control_flag_value(p
, 0xff);
826 brw_MOV(p
, real_dst
, dst
);
829 brw_pop_insn_state(p
);
830 release_tmps(c
, mark
);
834 * For GLSL shaders, this KIL will be unconditional.
835 * It may be contained inside an IF/ENDIF structure of course.
837 static void emit_kil(struct brw_wm_compile
*c
)
839 struct brw_compile
*p
= &c
->func
;
840 struct brw_reg depth
= retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW
);
841 brw_push_insn_state(p
);
842 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
843 brw_NOT(p
, c
->emit_mask_reg
, brw_mask_reg(1)); //IMASK
844 brw_AND(p
, depth
, c
->emit_mask_reg
, depth
);
845 brw_pop_insn_state(p
);
848 static INLINE
struct brw_reg
high_words( struct brw_reg reg
)
850 return stride( suboffset( retype( reg
, BRW_REGISTER_TYPE_W
), 1 ),
854 static INLINE
struct brw_reg
low_words( struct brw_reg reg
)
856 return stride( retype( reg
, BRW_REGISTER_TYPE_W
), 0, 8, 2 );
859 static INLINE
struct brw_reg
even_bytes( struct brw_reg reg
)
861 return stride( retype( reg
, BRW_REGISTER_TYPE_B
), 0, 16, 2 );
864 static INLINE
struct brw_reg
odd_bytes( struct brw_reg reg
)
866 return stride( suboffset( retype( reg
, BRW_REGISTER_TYPE_B
), 1 ),
870 /* One-, two- and three-dimensional Perlin noise, similar to the description
871 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
872 static void noise1_sub( struct brw_wm_compile
*c
) {
874 struct brw_compile
*p
= &c
->func
;
875 struct brw_reg param
,
876 x0
, x1
, /* gradients at each end */
877 t
, tmp
[ 2 ], /* float temporaries */
878 itmp
[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
880 int mark
= mark_tmps( c
);
885 tmp
[ 0 ] = alloc_tmp( c
);
886 tmp
[ 1 ] = alloc_tmp( c
);
887 itmp
[ 0 ] = retype( tmp
[ 0 ], BRW_REGISTER_TYPE_UD
);
888 itmp
[ 1 ] = retype( tmp
[ 1 ], BRW_REGISTER_TYPE_UD
);
889 itmp
[ 2 ] = retype( x0
, BRW_REGISTER_TYPE_UD
);
890 itmp
[ 3 ] = retype( x1
, BRW_REGISTER_TYPE_UD
);
891 itmp
[ 4 ] = retype( t
, BRW_REGISTER_TYPE_UD
);
893 param
= lookup_tmp( c
, mark
- 2 );
895 brw_set_access_mode( p
, BRW_ALIGN_1
);
897 brw_MOV( p
, itmp
[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
899 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
900 be hashed. Also compute the remainder (offset within the unit
901 length), interleaved to reduce register dependency penalties. */
902 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param
);
903 brw_FRC( p
, param
, param
);
904 brw_ADD( p
, itmp
[ 1 ], itmp
[ 0 ], brw_imm_ud( 1 ) );
905 brw_MOV( p
, itmp
[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
906 brw_MOV( p
, itmp
[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
908 /* We're now ready to perform the hashing. The two hashes are
909 interleaved for performance. The hash function used is
910 designed to rapidly achieve avalanche and require only 32x16
911 bit multiplication, and 16-bit swizzles (which we get for
912 free). We can't use immediate operands in the multiplies,
913 because immediates are permitted only in src1 and the 16-bit
914 factor is permitted only in src0. */
915 for( i
= 0; i
< 2; i
++ )
916 brw_MUL( p
, itmp
[ i
], itmp
[ 2 ], itmp
[ i
] );
917 for( i
= 0; i
< 2; i
++ )
918 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
919 high_words( itmp
[ i
] ) );
920 for( i
= 0; i
< 2; i
++ )
921 brw_MUL( p
, itmp
[ i
], itmp
[ 3 ], itmp
[ i
] );
922 for( i
= 0; i
< 2; i
++ )
923 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
924 high_words( itmp
[ i
] ) );
925 for( i
= 0; i
< 2; i
++ )
926 brw_MUL( p
, itmp
[ i
], itmp
[ 4 ], itmp
[ i
] );
927 for( i
= 0; i
< 2; i
++ )
928 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
929 high_words( itmp
[ i
] ) );
931 /* Now we want to initialise the two gradients based on the
932 hashes. Format conversion from signed integer to float leaves
933 everything scaled too high by a factor of pow( 2, 31 ), but
934 we correct for that right at the end. */
935 brw_ADD( p
, t
, param
, brw_imm_f( -1.0 ) );
936 brw_MOV( p
, x0
, retype( tmp
[ 0 ], BRW_REGISTER_TYPE_D
) );
937 brw_MOV( p
, x1
, retype( tmp
[ 1 ], BRW_REGISTER_TYPE_D
) );
939 brw_MUL( p
, x0
, x0
, param
);
940 brw_MUL( p
, x1
, x1
, t
);
942 /* We interpolate between the gradients using the polynomial
943 6t^5 - 15t^4 + 10t^3 (Perlin). */
944 brw_MUL( p
, tmp
[ 0 ], param
, brw_imm_f( 6.0 ) );
945 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( -15.0 ) );
946 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param
);
947 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( 10.0 ) );
948 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param
);
949 brw_ADD( p
, x1
, x1
, negate( x0
) ); /* unrelated work to fill the
951 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param
);
952 brw_MUL( p
, param
, tmp
[ 0 ], param
);
953 brw_MUL( p
, x1
, x1
, param
);
954 brw_ADD( p
, x0
, x0
, x1
);
955 /* scale by pow( 2, -30 ), to compensate for the format conversion
956 above and an extra factor of 2 so that a single gradient covers
958 brw_MUL( p
, param
, x0
, brw_imm_f( 0.000000000931322574615478515625 ) );
960 release_tmps( c
, mark
);
963 static void emit_noise1( struct brw_wm_compile
*c
,
964 const struct prog_instruction
*inst
)
966 struct brw_compile
*p
= &c
->func
;
967 struct brw_reg src
, param
, dst
;
968 GLuint mask
= inst
->DstReg
.WriteMask
;
970 int mark
= mark_tmps( c
);
974 src
= get_src_reg( c
, inst
, 0, 0 );
976 param
= alloc_tmp( c
);
978 brw_MOV( p
, param
, src
);
980 invoke_subroutine( c
, SUB_NOISE1
, noise1_sub
);
982 /* Fill in the result: */
983 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
984 for (i
= 0 ; i
< 4; i
++) {
986 dst
= get_dst_reg(c
, inst
, i
);
987 brw_MOV( p
, dst
, param
);
990 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
991 brw_set_saturate( p
, 0 );
993 release_tmps( c
, mark
);
996 static void noise2_sub( struct brw_wm_compile
*c
) {
998 struct brw_compile
*p
= &c
->func
;
999 struct brw_reg param0
, param1
,
1000 x0y0
, x0y1
, x1y0
, x1y1
, /* gradients at each corner */
1001 t
, tmp
[ 4 ], /* float temporaries */
1002 itmp
[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1004 int mark
= mark_tmps( c
);
1006 x0y0
= alloc_tmp( c
);
1007 x0y1
= alloc_tmp( c
);
1008 x1y0
= alloc_tmp( c
);
1009 x1y1
= alloc_tmp( c
);
1011 for( i
= 0; i
< 4; i
++ ) {
1012 tmp
[ i
] = alloc_tmp( c
);
1013 itmp
[ i
] = retype( tmp
[ i
], BRW_REGISTER_TYPE_UD
);
1015 itmp
[ 4 ] = retype( x0y0
, BRW_REGISTER_TYPE_UD
);
1016 itmp
[ 5 ] = retype( x0y1
, BRW_REGISTER_TYPE_UD
);
1017 itmp
[ 6 ] = retype( x1y0
, BRW_REGISTER_TYPE_UD
);
1019 param0
= lookup_tmp( c
, mark
- 3 );
1020 param1
= lookup_tmp( c
, mark
- 2 );
1022 brw_set_access_mode( p
, BRW_ALIGN_1
);
1024 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1025 be hashed. Also compute the remainders (offsets within the unit
1026 square), interleaved to reduce register dependency penalties. */
1027 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param0
);
1028 brw_RNDD( p
, retype( itmp
[ 1 ], BRW_REGISTER_TYPE_D
), param1
);
1029 brw_FRC( p
, param0
, param0
);
1030 brw_FRC( p
, param1
, param1
);
1031 brw_MOV( p
, itmp
[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1032 brw_ADD( p
, high_words( itmp
[ 0 ] ), high_words( itmp
[ 0 ] ),
1033 low_words( itmp
[ 1 ] ) );
1034 brw_MOV( p
, itmp
[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1035 brw_MOV( p
, itmp
[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1036 brw_ADD( p
, itmp
[ 1 ], itmp
[ 0 ], brw_imm_ud( 0x10000 ) );
1037 brw_ADD( p
, itmp
[ 2 ], itmp
[ 0 ], brw_imm_ud( 0x1 ) );
1038 brw_ADD( p
, itmp
[ 3 ], itmp
[ 0 ], brw_imm_ud( 0x10001 ) );
1040 /* We're now ready to perform the hashing. The four hashes are
1041 interleaved for performance. The hash function used is
1042 designed to rapidly achieve avalanche and require only 32x16
1043 bit multiplication, and 16-bit swizzles (which we get for
1044 free). We can't use immediate operands in the multiplies,
1045 because immediates are permitted only in src1 and the 16-bit
1046 factor is permitted only in src0. */
1047 for( i
= 0; i
< 4; i
++ )
1048 brw_MUL( p
, itmp
[ i
], itmp
[ 4 ], itmp
[ i
] );
1049 for( i
= 0; i
< 4; i
++ )
1050 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
1051 high_words( itmp
[ i
] ) );
1052 for( i
= 0; i
< 4; i
++ )
1053 brw_MUL( p
, itmp
[ i
], itmp
[ 5 ], itmp
[ i
] );
1054 for( i
= 0; i
< 4; i
++ )
1055 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
1056 high_words( itmp
[ i
] ) );
1057 for( i
= 0; i
< 4; i
++ )
1058 brw_MUL( p
, itmp
[ i
], itmp
[ 6 ], itmp
[ i
] );
1059 for( i
= 0; i
< 4; i
++ )
1060 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
1061 high_words( itmp
[ i
] ) );
1063 /* Now we want to initialise the four gradients based on the
1064 hashes. Format conversion from signed integer to float leaves
1065 everything scaled too high by a factor of pow( 2, 15 ), but
1066 we correct for that right at the end. */
1067 brw_ADD( p
, t
, param0
, brw_imm_f( -1.0 ) );
1068 brw_MOV( p
, x0y0
, low_words( tmp
[ 0 ] ) );
1069 brw_MOV( p
, x0y1
, low_words( tmp
[ 1 ] ) );
1070 brw_MOV( p
, x1y0
, low_words( tmp
[ 2 ] ) );
1071 brw_MOV( p
, x1y1
, low_words( tmp
[ 3 ] ) );
1073 brw_MOV( p
, tmp
[ 0 ], high_words( tmp
[ 0 ] ) );
1074 brw_MOV( p
, tmp
[ 1 ], high_words( tmp
[ 1 ] ) );
1075 brw_MOV( p
, tmp
[ 2 ], high_words( tmp
[ 2 ] ) );
1076 brw_MOV( p
, tmp
[ 3 ], high_words( tmp
[ 3 ] ) );
1078 brw_MUL( p
, x1y0
, x1y0
, t
);
1079 brw_MUL( p
, x1y1
, x1y1
, t
);
1080 brw_ADD( p
, t
, param1
, brw_imm_f( -1.0 ) );
1081 brw_MUL( p
, x0y0
, x0y0
, param0
);
1082 brw_MUL( p
, x0y1
, x0y1
, param0
);
1084 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param1
);
1085 brw_MUL( p
, tmp
[ 2 ], tmp
[ 2 ], param1
);
1086 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], t
);
1087 brw_MUL( p
, tmp
[ 3 ], tmp
[ 3 ], t
);
1089 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 0 ] );
1090 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 2 ] );
1091 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 1 ] );
1092 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 3 ] );
1094 /* We interpolate between the gradients using the polynomial
1095 6t^5 - 15t^4 + 10t^3 (Perlin). */
1096 brw_MUL( p
, tmp
[ 0 ], param0
, brw_imm_f( 6.0 ) );
1097 brw_MUL( p
, tmp
[ 1 ], param1
, brw_imm_f( 6.0 ) );
1098 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( -15.0 ) );
1099 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], brw_imm_f( -15.0 ) );
1100 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param0
);
1101 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], param1
);
1102 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) ); /* unrelated work to fill the
1104 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( 10.0 ) );
1105 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], brw_imm_f( 10.0 ) );
1106 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param0
);
1107 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], param1
);
1108 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) ); /* unrelated work to fill the
1110 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param0
);
1111 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], param1
);
1112 brw_MUL( p
, param0
, tmp
[ 0 ], param0
);
1113 brw_MUL( p
, param1
, tmp
[ 1 ], param1
);
1115 /* Here we interpolate in the y dimension... */
1116 brw_MUL( p
, x0y1
, x0y1
, param1
);
1117 brw_MUL( p
, x1y1
, x1y1
, param1
);
1118 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
1119 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
1121 /* And now in x. There are horrible register dependencies here,
1122 but we have nothing else to do. */
1123 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
1124 brw_MUL( p
, x1y0
, x1y0
, param0
);
1125 brw_ADD( p
, x0y0
, x0y0
, x1y0
);
1127 /* scale by pow( 2, -15 ), as described above */
1128 brw_MUL( p
, param0
, x0y0
, brw_imm_f( 0.000030517578125 ) );
1130 release_tmps( c
, mark
);
1133 static void emit_noise2( struct brw_wm_compile
*c
,
1134 const struct prog_instruction
*inst
)
1136 struct brw_compile
*p
= &c
->func
;
1137 struct brw_reg src0
, src1
, param0
, param1
, dst
;
1138 GLuint mask
= inst
->DstReg
.WriteMask
;
1140 int mark
= mark_tmps( c
);
1142 assert( mark
== 0 );
1144 src0
= get_src_reg( c
, inst
, 0, 0 );
1145 src1
= get_src_reg( c
, inst
, 0, 1 );
1147 param0
= alloc_tmp( c
);
1148 param1
= alloc_tmp( c
);
1150 brw_MOV( p
, param0
, src0
);
1151 brw_MOV( p
, param1
, src1
);
1153 invoke_subroutine( c
, SUB_NOISE2
, noise2_sub
);
1155 /* Fill in the result: */
1156 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
1157 for (i
= 0 ; i
< 4; i
++) {
1158 if (mask
& (1<<i
)) {
1159 dst
= get_dst_reg(c
, inst
, i
);
1160 brw_MOV( p
, dst
, param0
);
1163 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
1164 brw_set_saturate( p
, 0 );
1166 release_tmps( c
, mark
);
/**
 * The three-dimensional case is much like the one- and two- versions above,
 * but since the number of corners is rapidly growing we now pack 16 16-bit
 * hashes into each register to extract more parallelism from the EUs.
 */
/*
 * noise3_sub: emit the body of the three-argument noise subroutine.
 *
 * The three coordinates are read from the temporaries at indices mark-4,
 * mark-3 and mark-2, relative to the mark taken on entry; presumably these
 * are the registers the caller loaded before invoking the subroutine --
 * confirm against invoke_subroutine().  The coordinates are split into
 * floor and fractional parts, the floors are hashed, the hashes are turned
 * into corner gradients, and the gradients are blended with the
 * 6t^5 - 15t^4 + 10t^3 weights.  The result, scaled by 2^-15, is written
 * back to the first argument register (param0), and release_tmps() is
 * called with the entry mark before returning.
 *
 * NOTE(review): this listing shows no line that allocates `t` or declares
 * the loop index `i`; check the complete file before editing this function.
 */
1174 static void noise3_sub( struct brw_wm_compile
*c
) {
1176 struct brw_compile
*p
= &c
->func
;
1177 struct brw_reg param0
, param1
, param2
,
1178 x0y0
, x0y1
, x1y0
, x1y1
, /* gradients at four of the corners */
1179 xi
, yi
, zi
, /* interpolation coefficients */
1180 t
, tmp
[ 8 ], /* float temporaries */
1181 itmp
[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1182 wtmp
[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1184 int mark
= mark_tmps( c
);
/* Scratch registers for the corner gradients and the interpolation
   weights. */
1186 x0y0
= alloc_tmp( c
);
1187 x0y1
= alloc_tmp( c
);
1188 x1y0
= alloc_tmp( c
);
1189 x1y1
= alloc_tmp( c
);
1190 xi
= alloc_tmp( c
);
1191 yi
= alloc_tmp( c
);
1192 zi
= alloc_tmp( c
);
/* tmp[], itmp[] and wtmp[] are three typed views of the same eight
   registers: float, unsigned dword, and 16 unsigned words. */
1194 for( i
= 0; i
< 8; i
++ ) {
1195 tmp
[ i
] = alloc_tmp( c
);
1196 itmp
[ i
] = retype( tmp
[ i
], BRW_REGISTER_TYPE_UD
);
1197 wtmp
[ i
] = brw_uw16_grf( tmp
[ i
].nr
, 0 );
/* The arguments are temporaries that were allocated before our mark. */
1200 param0
= lookup_tmp( c
, mark
- 4 );
1201 param1
= lookup_tmp( c
, mark
- 3 );
1202 param2
= lookup_tmp( c
, mark
- 2 );
1204 brw_set_access_mode( p
, BRW_ALIGN_1
);
1206 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1207 be hashed. Also compute the remainders (offsets within the unit
1208 cube), interleaved to reduce register dependency penalties. */
1209 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param0
);
1210 brw_RNDD( p
, retype( itmp
[ 1 ], BRW_REGISTER_TYPE_D
), param1
);
1211 brw_RNDD( p
, retype( itmp
[ 2 ], BRW_REGISTER_TYPE_D
), param2
);
1212 brw_FRC( p
, param0
, param0
);
1213 brw_FRC( p
, param1
, param1
);
1214 brw_FRC( p
, param2
, param2
);
1215 /* Since we now have only 16 bits of precision in the hash, we must
1216 be more careful about thorough mixing to maintain entropy as we
1217 squash the input vector into a small scalar. */
1218 brw_MUL( p
, brw_null_reg(), low_words( itmp
[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1219 brw_MAC( p
, brw_null_reg(), low_words( itmp
[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1220 brw_MAC( p
, low_words( itmp
[ 0 ] ), low_words( itmp
[ 2 ] ),
1221 brw_imm_uw( 0x9B93 ) );
1222 brw_ADD( p
, high_words( itmp
[ 0 ] ), low_words( itmp
[ 0 ] ),
1223 brw_imm_uw( 0xBC8F ) );
1225 /* Temporarily disable the execution mask while we work with ExecSize=16
1226 channels (the mask is set for ExecSize=8 and is probably incorrect).
1227 Although this might cause execution of unwanted channels, the code
1228 writes only to temporary registers and has no side effects, so
1229 disabling the mask is harmless. */
1230 brw_push_insn_state( p
);
1231 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1232 brw_ADD( p
, wtmp
[ 1 ], wtmp
[ 0 ], brw_imm_uw( 0xD0BD ) );
1233 brw_ADD( p
, wtmp
[ 2 ], wtmp
[ 0 ], brw_imm_uw( 0x9B93 ) );
1234 brw_ADD( p
, wtmp
[ 3 ], wtmp
[ 1 ], brw_imm_uw( 0x9B93 ) );
1236 /* We're now ready to perform the hashing. The eight hashes are
1237 interleaved for performance. The hash function used is
1238 designed to rapidly achieve avalanche and require only 16x16
1239 bit multiplication, and 8-bit swizzles (which we get for
1241 for( i
= 0; i
< 4; i
++ )
1242 brw_MUL( p
, wtmp
[ i
], wtmp
[ i
], brw_imm_uw( 0x28D9 ) );
1243 for( i
= 0; i
< 4; i
++ )
1244 brw_XOR( p
, even_bytes( wtmp
[ i
] ), even_bytes( wtmp
[ i
] ),
1245 odd_bytes( wtmp
[ i
] ) );
1246 for( i
= 0; i
< 4; i
++ )
1247 brw_MUL( p
, wtmp
[ i
], wtmp
[ i
], brw_imm_uw( 0xC6D5 ) );
1248 for( i
= 0; i
< 4; i
++ )
1249 brw_XOR( p
, even_bytes( wtmp
[ i
] ), even_bytes( wtmp
[ i
] ),
1250 odd_bytes( wtmp
[ i
] ) );
1251 brw_pop_insn_state( p
);
1253 /* Now we want to initialise the four rear gradients based on the
1254 hashes. Format conversion from signed integer to float leaves
1255 everything scaled too high by a factor of pow( 2, 15 ), but
1256 we correct for that right at the end. */
/* First term: the hash words of tmp[ 0 ] and tmp[ 1 ] become the four
   gradients; the low-word ones are weighted by param0 and the high-word
   ones by param0 - 1 (held in t). */
1258 brw_ADD( p
, t
, param0
, brw_imm_f( -1.0 ) );
1259 brw_MOV( p
, x0y0
, low_words( tmp
[ 0 ] ) );
1260 brw_MOV( p
, x0y1
, low_words( tmp
[ 1 ] ) );
1261 brw_MOV( p
, x1y0
, high_words( tmp
[ 0 ] ) );
1262 brw_MOV( p
, x1y1
, high_words( tmp
[ 1 ] ) );
1264 brw_push_insn_state( p
);
1265 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1266 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 5 ) );
1267 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 5 ) );
1268 brw_pop_insn_state( p
);
1270 brw_MUL( p
, x1y0
, x1y0
, t
);
1271 brw_MUL( p
, x1y1
, x1y1
, t
);
1272 brw_ADD( p
, t
, param1
, brw_imm_f( -1.0 ) );
1273 brw_MUL( p
, x0y0
, x0y0
, param0
);
1274 brw_MUL( p
, x0y1
, x0y1
, param0
);
/* Second term: the hashes were shifted left by 5 bits above, so these
   conversions yield new values.  Those taken from tmp[ 0 ] are weighted by
   param1, those from tmp[ 1 ] by param1 - 1 (t), and each is added to its
   gradient. */
1277 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
1278 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
1279 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
1280 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
1282 brw_push_insn_state( p
);
1283 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1284 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 5 ) );
1285 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 5 ) );
1286 brw_pop_insn_state( p
);
1288 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1289 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1290 brw_ADD( p
, t
, param0
, brw_imm_f( -1.0 ) );
1291 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param1
);
1292 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param1
);
1294 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1295 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1296 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1297 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
/* Third term for this face: all four values are weighted by param2. */
1300 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
1301 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
1302 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
1303 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
1305 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param2
);
1306 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], param2
);
1307 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param2
);
1308 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], param2
);
1310 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1311 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1312 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1313 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1315 /* We interpolate between the gradients using the polynomial
1316 6t^5 - 15t^4 + 10t^3 (Perlin). */
1317 brw_MUL( p
, xi
, param0
, brw_imm_f( 6.0 ) );
1318 brw_MUL( p
, yi
, param1
, brw_imm_f( 6.0 ) );
1319 brw_MUL( p
, zi
, param2
, brw_imm_f( 6.0 ) );
1320 brw_ADD( p
, xi
, xi
, brw_imm_f( -15.0 ) );
1321 brw_ADD( p
, yi
, yi
, brw_imm_f( -15.0 ) );
1322 brw_ADD( p
, zi
, zi
, brw_imm_f( -15.0 ) );
1323 brw_MUL( p
, xi
, xi
, param0
);
1324 brw_MUL( p
, yi
, yi
, param1
);
1325 brw_MUL( p
, zi
, zi
, param2
);
1326 brw_ADD( p
, xi
, xi
, brw_imm_f( 10.0 ) );
1327 brw_ADD( p
, yi
, yi
, brw_imm_f( 10.0 ) );
1328 brw_ADD( p
, zi
, zi
, brw_imm_f( 10.0 ) );
1329 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) ); /* unrelated work */
1330 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) ); /* unrelated work */
1331 brw_MUL( p
, xi
, xi
, param0
);
1332 brw_MUL( p
, yi
, yi
, param1
);
1333 brw_MUL( p
, zi
, zi
, param2
);
1334 brw_MUL( p
, xi
, xi
, param0
);
1335 brw_MUL( p
, yi
, yi
, param1
);
1336 brw_MUL( p
, zi
, zi
, param2
);
1337 brw_MUL( p
, xi
, xi
, param0
);
1338 brw_MUL( p
, yi
, yi
, param1
);
1339 brw_MUL( p
, zi
, zi
, param2
);
1341 /* Here we interpolate in the y dimension... */
1342 brw_MUL( p
, x0y1
, x0y1
, yi
);
1343 brw_MUL( p
, x1y1
, x1y1
, yi
);
1344 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
1345 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
1347 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1348 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
1349 brw_MUL( p
, x1y0
, x1y0
, xi
);
1350 brw_ADD( p
, tmp
[ 0 ], x0y0
, x1y0
);
1352 /* Now do the same thing for the front four gradients... */
/* This pass reads the hashes held in tmp[ 2 ] and tmp[ 3 ]; t still holds
   param0 - 1 from the previous pass. */
1354 brw_MOV( p
, x0y0
, low_words( tmp
[ 2 ] ) );
1355 brw_MOV( p
, x0y1
, low_words( tmp
[ 3 ] ) );
1356 brw_MOV( p
, x1y0
, high_words( tmp
[ 2 ] ) );
1357 brw_MOV( p
, x1y1
, high_words( tmp
[ 3 ] ) );
1359 brw_push_insn_state( p
);
1360 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1361 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 5 ) );
1362 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 5 ) );
1363 brw_pop_insn_state( p
);
1365 brw_MUL( p
, x1y0
, x1y0
, t
);
1366 brw_MUL( p
, x1y1
, x1y1
, t
);
1367 brw_ADD( p
, t
, param1
, brw_imm_f( -1.0 ) );
1368 brw_MUL( p
, x0y0
, x0y0
, param0
);
1369 brw_MUL( p
, x0y1
, x0y1
, param0
);
/* Second term of this pass, weighted by param1 or param1 - 1 as before. */
1372 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
1373 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
1374 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
1375 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
1377 brw_push_insn_state( p
);
1378 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1379 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 5 ) );
1380 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 5 ) );
1381 brw_pop_insn_state( p
);
1383 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1384 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1385 brw_ADD( p
, t
, param2
, brw_imm_f( -1.0 ) );
1386 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param1
);
1387 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param1
);
1389 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1390 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1391 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1392 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
/* Third term of this pass: weighted by param2 - 1 (now in t) rather than
   by param2. */
1395 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
1396 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
1397 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
1398 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
1400 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], t
);
1401 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1402 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], t
);
1403 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1405 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1406 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1407 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1408 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1410 /* The interpolation coefficients are still around from last time, so
1411 again interpolate in the y dimension... */
1412 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) );
1413 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) );
1414 brw_MUL( p
, x0y1
, x0y1
, yi
);
1415 brw_MUL( p
, x1y1
, x1y1
, yi
);
1416 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
1417 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
1419 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1420 time put the front face in tmp[ 1 ] and we're nearly there... */
1421 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
1422 brw_MUL( p
, x1y0
, x1y0
, xi
);
1423 brw_ADD( p
, tmp
[ 1 ], x0y0
, x1y0
);
1425 /* The final interpolation, in the z dimension: */
1426 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], negate( tmp
[ 0 ] ) );
1427 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], zi
);
1428 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], tmp
[ 1 ] );
1430 /* scale by pow( 2, -15 ), as described above */
1431 brw_MUL( p
, param0
, tmp
[ 0 ], brw_imm_f( 0.000030517578125 ) );
1433 release_tmps( c
, mark
);
1436 static void emit_noise3( struct brw_wm_compile
*c
,
1437 const struct prog_instruction
*inst
)
1439 struct brw_compile
*p
= &c
->func
;
1440 struct brw_reg src0
, src1
, src2
, param0
, param1
, param2
, dst
;
1441 GLuint mask
= inst
->DstReg
.WriteMask
;
1443 int mark
= mark_tmps( c
);
1445 assert( mark
== 0 );
1447 src0
= get_src_reg( c
, inst
, 0, 0 );
1448 src1
= get_src_reg( c
, inst
, 0, 1 );
1449 src2
= get_src_reg( c
, inst
, 0, 2 );
1451 param0
= alloc_tmp( c
);
1452 param1
= alloc_tmp( c
);
1453 param2
= alloc_tmp( c
);
1455 brw_MOV( p
, param0
, src0
);
1456 brw_MOV( p
, param1
, src1
);
1457 brw_MOV( p
, param2
, src2
);
1459 invoke_subroutine( c
, SUB_NOISE3
, noise3_sub
);
1461 /* Fill in the result: */
1462 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
1463 for (i
= 0 ; i
< 4; i
++) {
1464 if (mask
& (1<<i
)) {
1465 dst
= get_dst_reg(c
, inst
, i
);
1466 brw_MOV( p
, dst
, param0
);
1469 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
1470 brw_set_saturate( p
, 0 );
1472 release_tmps( c
, mark
);
/**
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes.  Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 */
/*
 * noise4_sub: emit the body of the four-argument noise subroutine.
 *
 * The four coordinates are read from the temporaries at indices mark-5 ..
 * mark-2, relative to the mark taken on entry; presumably these are the
 * registers the caller loaded before invoking the subroutine -- confirm
 * against invoke_subroutine().  The main section of the emitted code is
 * executed twice, distinguished by the flag register, and the two results
 * are blended with interp[ 3 ].  The final value, scaled by 2^-15, is
 * written back to the first argument register (param[ 0 ]), and
 * release_tmps() is called with the entry mark before returning.
 *
 * NOTE(review): this listing shows no assignment to `loop`, no allocation
 * of `t`, no declaration of the indices `i` and `j`, and the brw_ADD that
 * increments the w coordinate lacks its last operand here.  Check the
 * complete file before editing this function.
 */
1482 static void noise4_sub( struct brw_wm_compile
*c
)
1484 struct brw_compile
*p
= &c
->func
;
1485 struct brw_reg param
[ 4 ],
1486 x0y0
, x0y1
, x1y0
, x1y1
, /* gradients at four of the corners */
1487 w0
, /* noise for the w=0 cube */
1488 floors
[ 2 ], /* integer coordinates of base corner of hypercube */
1489 interp
[ 4 ], /* interpolation coefficients */
1490 t
, tmp
[ 8 ], /* float temporaries */
1491 itmp
[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1492 wtmp
[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1494 int mark
= mark_tmps( c
);
1495 GLuint loop
, origin
;
/* Scratch registers for the corner gradients, the saved first-pass result
   and the packed integer coordinates. */
1497 x0y0
= alloc_tmp( c
);
1498 x0y1
= alloc_tmp( c
);
1499 x1y0
= alloc_tmp( c
);
1500 x1y1
= alloc_tmp( c
);
1502 w0
= alloc_tmp( c
);
1503 floors
[ 0 ] = retype( alloc_tmp( c
), BRW_REGISTER_TYPE_UD
);
1504 floors
[ 1 ] = retype( alloc_tmp( c
), BRW_REGISTER_TYPE_UD
);
/* The arguments are temporaries allocated before our mark; each one gets
   its own interpolation-weight register. */
1506 for( i
= 0; i
< 4; i
++ ) {
1507 param
[ i
] = lookup_tmp( c
, mark
- 5 + i
);
1508 interp
[ i
] = alloc_tmp( c
);
/* tmp[], itmp[] and wtmp[] are three typed views of the same eight
   registers: float, unsigned dword, and 16 unsigned words. */
1511 for( i
= 0; i
< 8; i
++ ) {
1512 tmp
[ i
] = alloc_tmp( c
);
1513 itmp
[ i
] = retype( tmp
[ i
], BRW_REGISTER_TYPE_UD
);
1514 wtmp
[ i
] = brw_uw16_grf( tmp
[ i
].nr
, 0 );
1517 brw_set_access_mode( p
, BRW_ALIGN_1
);
1519 /* We only want 16 bits of precision from the integral part of each
1520 co-ordinate, but unfortunately the RNDD semantics would saturate
1521 at 16 bits if we performed the operation directly to a 16-bit
1522 destination. Therefore, we round to 32-bit temporaries where
1523 appropriate, and then store only the lower 16 bits. */
1524 brw_RNDD( p
, retype( floors
[ 0 ], BRW_REGISTER_TYPE_D
), param
[ 0 ] );
1525 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param
[ 1 ] );
1526 brw_RNDD( p
, retype( floors
[ 1 ], BRW_REGISTER_TYPE_D
), param
[ 2 ] );
1527 brw_RNDD( p
, retype( itmp
[ 1 ], BRW_REGISTER_TYPE_D
), param
[ 3 ] );
1528 brw_MOV( p
, high_words( floors
[ 0 ] ), low_words( itmp
[ 0 ] ) );
1529 brw_MOV( p
, high_words( floors
[ 1 ] ), low_words( itmp
[ 1 ] ) );
1531 /* Modify the flag register here, because the side effect is useful
1532 later (see below). We know for certain that all flags will be
1533 cleared, since the FRC instruction cannot possibly generate
1534 negative results. Even for exceptional inputs (infinities, denormals,
1535 NaNs), the architecture guarantees that the L conditional is false. */
1536 brw_set_conditionalmod( p
, BRW_CONDITIONAL_L
);
1537 brw_FRC( p
, param
[ 0 ], param
[ 0 ] );
1538 brw_set_predicate_control( p
, BRW_PREDICATE_NONE
);
1539 for( i
= 1; i
< 4; i
++ )
1540 brw_FRC( p
, param
[ i
], param
[ i
] );
1542 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1544 for( i
= 0; i
< 4; i
++ )
1545 brw_MUL( p
, interp
[ i
], param
[ i
], brw_imm_f( 6.0 ) );
1546 for( i
= 0; i
< 4; i
++ )
1547 brw_ADD( p
, interp
[ i
], interp
[ i
], brw_imm_f( -15.0 ) );
1548 for( i
= 0; i
< 4; i
++ )
1549 brw_MUL( p
, interp
[ i
], interp
[ i
], param
[ i
] );
1550 for( i
= 0; i
< 4; i
++ )
1551 brw_ADD( p
, interp
[ i
], interp
[ i
], brw_imm_f( 10.0 ) );
1552 for( j
= 0; j
< 3; j
++ )
1553 for( i
= 0; i
< 4; i
++ )
1554 brw_MUL( p
, interp
[ i
], interp
[ i
], param
[ i
] );
1556 /* Mark the current address, as it will be a jump destination. The
1557 following code will be executed twice: first, with the flag
1558 register clear indicating the w=0 case, and second with flags
1562 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1563 be hashed. Since we have only 16 bits of precision in the hash, we
1564 must be careful about thorough mixing to maintain entropy as we
1565 squash the input vector into a small scalar. */
1566 brw_MUL( p
, brw_null_reg(), low_words( floors
[ 0 ] ),
1567 brw_imm_uw( 0xBC8F ) );
1568 brw_MAC( p
, brw_null_reg(), high_words( floors
[ 0 ] ),
1569 brw_imm_uw( 0xD0BD ) );
1570 brw_MAC( p
, brw_null_reg(), low_words( floors
[ 1 ] ),
1571 brw_imm_uw( 0x9B93 ) );
1572 brw_MAC( p
, low_words( itmp
[ 0 ] ), high_words( floors
[ 1 ] ),
1573 brw_imm_uw( 0xA359 ) );
1574 brw_ADD( p
, high_words( itmp
[ 0 ] ), low_words( itmp
[ 0 ] ),
1575 brw_imm_uw( 0xBC8F ) );
1577 /* Temporarily disable the execution mask while we work with ExecSize=16
1578 channels (the mask is set for ExecSize=8 and is probably incorrect).
1579 Although this might cause execution of unwanted channels, the code
1580 writes only to temporary registers and has no side effects, so
1581 disabling the mask is harmless. */
1582 brw_push_insn_state( p
);
1583 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1584 brw_ADD( p
, wtmp
[ 1 ], wtmp
[ 0 ], brw_imm_uw( 0xD0BD ) );
1585 brw_ADD( p
, wtmp
[ 2 ], wtmp
[ 0 ], brw_imm_uw( 0x9B93 ) );
1586 brw_ADD( p
, wtmp
[ 3 ], wtmp
[ 1 ], brw_imm_uw( 0x9B93 ) );
1588 /* We're now ready to perform the hashing. The eight hashes are
1589 interleaved for performance. The hash function used is
1590 designed to rapidly achieve avalanche and require only 16x16
1591 bit multiplication, and 8-bit swizzles (which we get for
1593 for( i
= 0; i
< 4; i
++ )
1594 brw_MUL( p
, wtmp
[ i
], wtmp
[ i
], brw_imm_uw( 0x28D9 ) );
1595 for( i
= 0; i
< 4; i
++ )
1596 brw_XOR( p
, even_bytes( wtmp
[ i
] ), even_bytes( wtmp
[ i
] ),
1597 odd_bytes( wtmp
[ i
] ) );
1598 for( i
= 0; i
< 4; i
++ )
1599 brw_MUL( p
, wtmp
[ i
], wtmp
[ i
], brw_imm_uw( 0xC6D5 ) );
1600 for( i
= 0; i
< 4; i
++ )
1601 brw_XOR( p
, even_bytes( wtmp
[ i
] ), even_bytes( wtmp
[ i
] ),
1602 odd_bytes( wtmp
[ i
] ) );
1603 brw_pop_insn_state( p
);
1605 /* Now we want to initialise the four rear gradients based on the
1606 hashes. Format conversion from signed integer to float leaves
1607 everything scaled too high by a factor of pow( 2, 15 ), but
1608 we correct for that right at the end. */
/* First term: the hash words of tmp[ 0 ] and tmp[ 1 ] become the four
   gradients; the low-word ones are weighted by param[ 0 ] and the
   high-word ones by param[ 0 ] - 1 (held in t). */
1610 brw_ADD( p
, t
, param
[ 0 ], brw_imm_f( -1.0 ) );
1611 brw_MOV( p
, x0y0
, low_words( tmp
[ 0 ] ) );
1612 brw_MOV( p
, x0y1
, low_words( tmp
[ 1 ] ) );
1613 brw_MOV( p
, x1y0
, high_words( tmp
[ 0 ] ) );
1614 brw_MOV( p
, x1y1
, high_words( tmp
[ 1 ] ) );
1616 brw_push_insn_state( p
);
1617 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1618 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 4 ) );
1619 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 4 ) );
1620 brw_pop_insn_state( p
);
1622 brw_MUL( p
, x1y0
, x1y0
, t
);
1623 brw_MUL( p
, x1y1
, x1y1
, t
);
1624 brw_ADD( p
, t
, param
[ 1 ], brw_imm_f( -1.0 ) );
1625 brw_MUL( p
, x0y0
, x0y0
, param
[ 0 ] );
1626 brw_MUL( p
, x0y1
, x0y1
, param
[ 0 ] );
/* Second term: the hashes were shifted left by 4 bits above, so these
   conversions yield new values.  Those taken from tmp[ 0 ] are weighted by
   param[ 1 ], those from tmp[ 1 ] by param[ 1 ] - 1 (t). */
1629 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
1630 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
1631 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
1632 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
1634 brw_push_insn_state( p
);
1635 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1636 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 4 ) );
1637 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 4 ) );
1638 brw_pop_insn_state( p
);
1640 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1641 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1642 /* prepare t for the w component (used below): w the first time through
1643 the loop; w - 1 the second time) */
1644 brw_set_predicate_control( p
, BRW_PREDICATE_NORMAL
);
1645 brw_ADD( p
, t
, param
[ 3 ], brw_imm_f( -1.0 ) );
1646 p
->current
->header
.predicate_inverse
= 1;
1647 brw_MOV( p
, t
, param
[ 3 ] );
1648 p
->current
->header
.predicate_inverse
= 0;
1649 brw_set_predicate_control( p
, BRW_PREDICATE_NONE
);
1650 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param
[ 1 ] );
1651 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param
[ 1 ] );
1653 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1654 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1655 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1656 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
/* Third term for this face: all four values are weighted by param[ 2 ]. */
1659 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
1660 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
1661 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
1662 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
1664 brw_push_insn_state( p
);
1665 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1666 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 4 ) );
1667 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 4 ) );
1668 brw_pop_insn_state( p
);
1670 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param
[ 2 ] );
1671 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], param
[ 2 ] );
1672 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param
[ 2 ] );
1673 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], param
[ 2 ] );
1675 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1676 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1677 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1678 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
/* Fourth term: weighted by t, which the predicated code above set from
   param[ 3 ].  Afterwards t is reloaded with param[ 0 ] - 1 for the next
   face. */
1681 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
1682 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
1683 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
1684 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
1686 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], t
);
1687 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1688 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], t
);
1689 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1690 brw_ADD( p
, t
, param
[ 0 ], brw_imm_f( -1.0 ) );
1692 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1693 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1694 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1695 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1697 /* Here we interpolate in the y dimension... */
1698 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) );
1699 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) );
1700 brw_MUL( p
, x0y1
, x0y1
, interp
[ 1 ] );
1701 brw_MUL( p
, x1y1
, x1y1
, interp
[ 1 ] );
1702 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
1703 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
1705 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1706 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
1707 brw_MUL( p
, x1y0
, x1y0
, interp
[ 0 ] );
1708 brw_ADD( p
, tmp
[ 0 ], x0y0
, x1y0
);
1710 /* Now do the same thing for the front four gradients... */
/* This pass reads the hashes held in tmp[ 2 ] and tmp[ 3 ]. */
1712 brw_MOV( p
, x0y0
, low_words( tmp
[ 2 ] ) );
1713 brw_MOV( p
, x0y1
, low_words( tmp
[ 3 ] ) );
1714 brw_MOV( p
, x1y0
, high_words( tmp
[ 2 ] ) );
1715 brw_MOV( p
, x1y1
, high_words( tmp
[ 3 ] ) );
1717 brw_push_insn_state( p
);
1718 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1719 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 4 ) );
1720 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 4 ) );
1721 brw_pop_insn_state( p
);
1723 brw_MUL( p
, x1y0
, x1y0
, t
);
1724 brw_MUL( p
, x1y1
, x1y1
, t
);
1725 brw_ADD( p
, t
, param
[ 1 ], brw_imm_f( -1.0 ) );
1726 brw_MUL( p
, x0y0
, x0y0
, param
[ 0 ] );
1727 brw_MUL( p
, x0y1
, x0y1
, param
[ 0 ] );
/* Second term of this pass, weighted by param[ 1 ] or param[ 1 ] - 1. */
1730 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
1731 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
1732 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
1733 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
1735 brw_push_insn_state( p
);
1736 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1737 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 4 ) );
1738 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 4 ) );
1739 brw_pop_insn_state( p
);
1741 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1742 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1743 brw_ADD( p
, t
, param
[ 2 ], brw_imm_f( -1.0 ) );
1744 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param
[ 1 ] );
1745 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param
[ 1 ] );
1747 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1748 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1749 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1750 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
/* Third term of this pass: weighted by param[ 2 ] - 1 (now in t). */
1753 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
1754 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
1755 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
1756 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
1758 brw_push_insn_state( p
);
1759 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1760 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 4 ) );
1761 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 4 ) );
1762 brw_pop_insn_state( p
);
1764 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], t
);
1765 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1766 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], t
);
1767 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1768 /* prepare t for the w component (used below): w the first time through
1769 the loop; w - 1 the second time) */
1770 brw_set_predicate_control( p
, BRW_PREDICATE_NORMAL
);
1771 brw_ADD( p
, t
, param
[ 3 ], brw_imm_f( -1.0 ) );
1772 p
->current
->header
.predicate_inverse
= 1;
1773 brw_MOV( p
, t
, param
[ 3 ] );
1774 p
->current
->header
.predicate_inverse
= 0;
1775 brw_set_predicate_control( p
, BRW_PREDICATE_NONE
);
1777 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1778 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1779 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1780 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
/* Fourth term of this pass, weighted by the t just prepared. */
1783 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
1784 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
1785 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
1786 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
1788 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], t
);
1789 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1790 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], t
);
1791 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1793 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1794 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1795 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1796 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1798 /* Interpolate in the y dimension: */
1799 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) );
1800 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) );
1801 brw_MUL( p
, x0y1
, x0y1
, interp
[ 1 ] );
1802 brw_MUL( p
, x1y1
, x1y1
, interp
[ 1 ] );
1803 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
1804 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
1806 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1807 time put the front face in tmp[ 1 ] and we're nearly there... */
1808 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
1809 brw_MUL( p
, x1y0
, x1y0
, interp
[ 0 ] );
1810 brw_ADD( p
, tmp
[ 1 ], x0y0
, x1y0
);
1812 /* Another interpolation, in the z dimension: */
1813 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], negate( tmp
[ 0 ] ) );
1814 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], interp
[ 2 ] );
1815 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], tmp
[ 1 ] );
1817 /* Exit the loop if we've computed both cubes... */
/* The jump distance is emitted as 0 here and patched through `origin`
   further down, once the destination is known. */
1818 origin
= p
->nr_insn
;
1819 brw_push_insn_state( p
);
1820 brw_set_predicate_control( p
, BRW_PREDICATE_NORMAL
);
1821 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1822 brw_ADD( p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
1823 brw_pop_insn_state( p
);
1825 /* Save the result for the w=0 case, and increment the w coordinate: */
1826 brw_MOV( p
, w0
, tmp
[ 0 ] );
1827 brw_ADD( p
, high_words( floors
[ 1 ] ), high_words( floors
[ 1 ] ),
1830 /* Loop around for the other cube. Explicitly set the flag register
1831 (unfortunately we must spend an extra instruction to do this: we
1832 can't rely on a side effect of the previous MOV or ADD because
1833 conditional modifiers which are normally true might be false in
1834 exceptional circumstances, e.g. given a NaN input; the add to
1835 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
1836 brw_push_insn_state( p
);
1837 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1838 brw_MOV( p
, brw_flag_reg(), brw_imm_uw( 0xFF ) );
1839 brw_ADD( p
, brw_ip_reg(), brw_ip_reg(),
1840 brw_imm_d( ( loop
- p
->nr_insn
) << 4 ) );
1841 brw_pop_insn_state( p
);
1843 /* Patch the previous conditional branch now that we know the
1844 destination address. */
1845 brw_set_src1( p
->store
+ origin
,
1846 brw_imm_d( ( p
->nr_insn
- origin
) << 4 ) );
1848 /* The very last interpolation. */
1849 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], negate( w0
) );
1850 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], interp
[ 3 ] );
1851 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], w0
);
1853 /* scale by pow( 2, -15 ), as described above */
1854 brw_MUL( p
, param
[ 0 ], tmp
[ 0 ], brw_imm_f( 0.000030517578125 ) );
1856 release_tmps( c
, mark
);
1859 static void emit_noise4( struct brw_wm_compile
*c
,
1860 const struct prog_instruction
*inst
)
1862 struct brw_compile
*p
= &c
->func
;
1863 struct brw_reg src0
, src1
, src2
, src3
, param0
, param1
, param2
, param3
, dst
;
1864 GLuint mask
= inst
->DstReg
.WriteMask
;
1866 int mark
= mark_tmps( c
);
1868 assert( mark
== 0 );
1870 src0
= get_src_reg( c
, inst
, 0, 0 );
1871 src1
= get_src_reg( c
, inst
, 0, 1 );
1872 src2
= get_src_reg( c
, inst
, 0, 2 );
1873 src3
= get_src_reg( c
, inst
, 0, 3 );
1875 param0
= alloc_tmp( c
);
1876 param1
= alloc_tmp( c
);
1877 param2
= alloc_tmp( c
);
1878 param3
= alloc_tmp( c
);
1880 brw_MOV( p
, param0
, src0
);
1881 brw_MOV( p
, param1
, src1
);
1882 brw_MOV( p
, param2
, src2
);
1883 brw_MOV( p
, param3
, src3
);
1885 invoke_subroutine( c
, SUB_NOISE4
, noise4_sub
);
1887 /* Fill in the result: */
1888 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
1889 for (i
= 0 ; i
< 4; i
++) {
1890 if (mask
& (1<<i
)) {
1891 dst
= get_dst_reg(c
, inst
, i
);
1892 brw_MOV( p
, dst
, param0
);
1895 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
1896 brw_set_saturate( p
, 0 );
1898 release_tmps( c
, mark
);
/* BIAS on SIMD8 not working yet... */
/*
 * emit_txb: emit a texture sample that carries a bias value.
 *
 * The four destination and four source channel registers are fetched, the
 * texture coordinates are copied into message registers 2..4 according to
 * the texture target (coordinates the target does not use are written as
 * 0), source component 3 is copied to message register 5 as the bias and
 * message register 6 is zeroed.  The sampler message type is
 * BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG when BRW_IS_IGDNG() holds,
 * and presumably BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS otherwise.
 *
 * NOTE(review): several short lines are absent from this listing -- the
 * declarations of `i` and `msg_type`, the terminators of the switch cases
 * and the action taken for an invalid target, the `else`, and the head and
 * some arguments of the final sampler call.  Consult the complete file
 * before changing this function.
 */
1905 static void emit_txb(struct brw_wm_compile
*c
,
1906 const struct prog_instruction
*inst
)
1908 struct brw_compile
*p
= &c
->func
;
1909 struct brw_reg dst
[4], src
[4], payload_reg
;
1910 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
1911 const GLuint unit
= inst
->TexSrcUnit
;
1915 assert(unit
< BRW_MAX_TEX_UNIT
);
1917 payload_reg
= get_reg(c
, PROGRAM_PAYLOAD
, PAYLOAD_DEPTH
, 0, 1, 0, 0);
1919 for (i
= 0; i
< 4; i
++)
1920 dst
[i
] = get_dst_reg(c
, inst
, i
);
1921 for (i
= 0; i
< 4; i
++)
1922 src
[i
] = get_src_reg(c
, inst
, 0, i
);
/* Coordinates go to message registers 2..4; dimensions the target lacks
   are filled with zero. */
1924 switch (inst
->TexSrcTarget
) {
1925 case TEXTURE_1D_INDEX
:
1926 brw_MOV(p
, brw_message_reg(2), src
[0]); /* s coord */
1927 brw_MOV(p
, brw_message_reg(3), brw_imm_f(0)); /* t coord */
1928 brw_MOV(p
, brw_message_reg(4), brw_imm_f(0)); /* r coord */
1930 case TEXTURE_2D_INDEX
:
1931 case TEXTURE_RECT_INDEX
:
1932 brw_MOV(p
, brw_message_reg(2), src
[0]);
1933 brw_MOV(p
, brw_message_reg(3), src
[1]);
1934 brw_MOV(p
, brw_message_reg(4), brw_imm_f(0));
1936 case TEXTURE_3D_INDEX
:
1937 case TEXTURE_CUBE_INDEX
:
1938 brw_MOV(p
, brw_message_reg(2), src
[0]);
1939 brw_MOV(p
, brw_message_reg(3), src
[1]);
1940 brw_MOV(p
, brw_message_reg(4), src
[2]);
1943 /* invalid target */
1946 brw_MOV(p
, brw_message_reg(5), src
[3]); /* bias */
1947 brw_MOV(p
, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
1949 if (BRW_IS_IGDNG(p
->brw
)) {
1950 msg_type
= BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG
;
1952 /* Does it work well on SIMD8? */
1953 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS
;
/* What follows are arguments of the sampler call; its opening line and
   some of its arguments are not present in this listing. */
1957 retype(vec8(dst
[0]), BRW_REGISTER_TYPE_UW
), /* dest */
1959 retype(payload_reg
, BRW_REGISTER_TYPE_UW
), /* src0 */
1960 SURF_INDEX_TEXTURE(unit
),
1962 inst
->DstReg
.WriteMask
, /* writemask */
1963 msg_type
, /* msg_type */
1964 4, /* response_length */
1968 BRW_SAMPLER_SIMD_MODE_SIMD8
);
/*
 * emit_tex: emit an ordinary (non-biased) texture sample.
 *
 * The four destination and four source channel registers are fetched and
 * `shadow` is set when the bit for this texture unit is set in
 * c->key.shadowtex_mask.  The texture target selects which coordinates are
 * emitted (XY for 2D/RECT, XYZ for 3D/CUBE); the coordinates are then
 * copied into consecutive message registers.  The sampler message type
 * depends on BRW_IS_IGDNG() and, on that hardware, on `shadow`.  The
 * message length passed to the sampler is 6 for shadow lookups and 4
 * otherwise.
 *
 * NOTE(review): several short lines are absent from this listing -- the
 * declarations of `i`, `nr`, `emit`, `msg_len` and `msg_type`, the body of
 * the 1D case, the assignments to `nr` and `msg_len`, the case terminators
 * and invalid-target action, the conditions guarding several statements,
 * and the head and some arguments of the sampler call.  Consult the
 * complete file before changing this function.
 */
1972 static void emit_tex(struct brw_wm_compile
*c
,
1973 const struct prog_instruction
*inst
)
1975 struct brw_compile
*p
= &c
->func
;
1976 struct brw_reg dst
[4], src
[4], payload_reg
;
1977 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
1978 const GLuint unit
= inst
->TexSrcUnit
;
1982 GLboolean shadow
= (c
->key
.shadowtex_mask
& (1<<unit
)) ? 1 : 0;
1985 assert(unit
< BRW_MAX_TEX_UNIT
);
1987 payload_reg
= get_reg(c
, PROGRAM_PAYLOAD
, PAYLOAD_DEPTH
, 0, 1, 0, 0);
1989 for (i
= 0; i
< 4; i
++)
1990 dst
[i
] = get_dst_reg(c
, inst
, i
);
1991 for (i
= 0; i
< 4; i
++)
1992 src
[i
] = get_src_reg(c
, inst
, 0, i
);
/* Choose which coordinate components this target needs. */
1994 switch (inst
->TexSrcTarget
) {
1995 case TEXTURE_1D_INDEX
:
1999 case TEXTURE_2D_INDEX
:
2000 case TEXTURE_RECT_INDEX
:
2001 emit
= WRITEMASK_XY
;
2004 case TEXTURE_3D_INDEX
:
2005 case TEXTURE_CUBE_INDEX
:
2006 emit
= WRITEMASK_XYZ
;
2010 /* invalid target */
2015 /* move/load S, T, R coords */
2016 for (i
= 0; i
< nr
; i
++) {
2017 static const GLuint swz
[4] = {0,1,2,2};
2019 brw_MOV(p
, brw_message_reg(msg_len
+1), src
[swz
[i
]]);
2021 brw_MOV(p
, brw_message_reg(msg_len
+1), brw_imm_f(0));
/* NOTE(review): the next two moves are presumably guarded by a test on
   `shadow` that is not visible here -- confirm. */
2026 brw_MOV(p
, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2027 brw_MOV(p
, brw_message_reg(6), src
[2]); /* ref value / R coord */
2030 if (BRW_IS_IGDNG(p
->brw
)) {
2032 msg_type
= BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG
;
2034 msg_type
= BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG
;
2036 /* Does it work for shadow on SIMD8 ? */
2037 msg_type
= BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE
;
/* What follows are arguments of the sampler call; its opening line and
   some of its arguments are not present in this listing. */
2041 retype(vec8(dst
[0]), BRW_REGISTER_TYPE_UW
), /* dest */
2043 retype(payload_reg
, BRW_REGISTER_TYPE_UW
), /* src0 */
2044 SURF_INDEX_TEXTURE(unit
),
2046 inst
->DstReg
.WriteMask
, /* writemask */
2047 msg_type
, /* msg_type */
2048 4, /* response_length */
2049 shadow
? 6 : 4, /* msg_length */
2052 BRW_SAMPLER_SIMD_MODE_SIMD8
);
/* NOTE(review): presumably only done for shadow lookups; the guarding
   condition is not visible here -- confirm. */
2055 brw_MOV(p
, dst
[3], brw_imm_f(1.0));
2060 * Resolve subroutine calls after code emit is done.
/*
 * post_wm_emit: hand the finished instruction stream (c->func) to
 * brw_resolve_cals(). Nothing else is done here.
 */
2062 static void post_wm_emit( struct brw_wm_compile
*c
)
2064 brw_resolve_cals(&c
->func
);
/*
 * get_argument_regs: fill regs[] with the source registers of one operand.
 *
 * c     - WM compile state, forwarded to get_src_reg().
 * inst  - instruction whose operand is being fetched.
 * index - operand number, forwarded to get_src_reg().
 * regs  - output array; entries 0..3 may be written.
 * mask  - bit i selects channel i.
 *
 * For each of the four channels whose bit is set in `mask`, regs[i] is set
 * from get_src_reg(c, inst, index, i). No assignment is visible for channels
 * whose bit is clear.
 *
 * NOTE(review): the return type and the declarations of `index`, `mask` and
 * `i` are not visible in this extract.
 */
2068 get_argument_regs(struct brw_wm_compile
*c
,
2069 const struct prog_instruction
*inst
,
2071 struct brw_reg
*regs
,
2076 for (i
= 0; i
< 4; i
++) {
2077 if (mask
& (1 << i
))
2078 regs
[i
] = get_src_reg(c
, inst
, index
, i
);
/*
 * brw_wm_emit_glsl: walk c->prog_instructions[0 .. c->nr_fp_insns-1] and emit
 * native instructions for each one through c->func.
 *
 * brw - driver context; used in the ENDLOOP handling for a BRW_IS_IGDNG()
 *       check.
 * c   - WM compile state (instruction list, register bookkeeping, stack).
 *
 * Open IF and loop constructs are tracked on two small local stacks,
 * if_inst[] and loop_inst[], sized by MAX_IF_DEPTH and MAX_LOOP_DEPTH.
 *
 * NOTE(review): most `case` labels, all `break`/`else` lines and several
 * declarations (`j`, `dst_flags`, `br`) are missing from this extract, so
 * the comments below describe only what the visible statements do.
 */
2082 static void brw_wm_emit_glsl(struct brw_context
*brw
, struct brw_wm_compile
*c
)
2084 #define MAX_IF_DEPTH 32
2085 #define MAX_LOOP_DEPTH 32
2086 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
];
2087 GLuint i
, if_depth
= 0, loop_depth
= 0;
2088 struct brw_compile
*p
= &c
->func
;
2089 struct brw_indirect stack_index
= brw_indirect(0, 0);
/* Start this compile with the out-of-registers flag cleared. */
2091 c
->out_of_regs
= GL_FALSE
;
/*
 * Uncompressed instructions; load the address register behind stack_index
 * with the address of c->stack. The call/return sequences further down
 * read and write through this register.
 */
2094 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
2095 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
/* Main loop: one iteration per program instruction. */
2097 for (i
= 0; i
< c
->nr_fp_insns
; i
++) {
2098 const struct prog_instruction
*inst
= &c
->prog_instructions
[i
];
2100 struct brw_reg args
[3][4], dst
[4];
/* Debug dump of the instruction (any guard is not visible here). */
2106 _mesa_printf("Inst %d: ", i
);
2107 _mesa_print_instruction(inst
);
2110 /* fetch any constants that this instruction needs */
2111 if (c
->fp
->use_const_buffer
)
2112 fetch_constants(c
, inst
);
/*
 * For every opcode except ARL: resolve destination registers per channel.
 * Channels in the write mask get get_dst_reg(); the other assignment uses
 * the null register (presumably for unwritten channels -- the else line is
 * not visible). Then fetch all four channels of each source operand.
 */
2114 if (inst
->Opcode
!= OPCODE_ARL
) {
2115 for (j
= 0; j
< 4; j
++) {
2116 if (inst
->DstReg
.WriteMask
& (1 << j
))
2117 dst
[j
] = get_dst_reg(c
, inst
, j
);
2119 dst
[j
] = brw_null_reg();
2122 for (j
= 0; j
< brw_wm_nr_args(inst
->Opcode
); j
++)
2123 get_argument_regs(c
, inst
, j
, args
[j
], WRITEMASK_XYZW
);
/* dst_flags = write mask, plus SATURATE when clamping to [0,1] is requested. */
2125 dst_flags
= inst
->DstReg
.WriteMask
;
2126 if (inst
->SaturateMode
== SATURATE_ZERO_ONE
)
2127 dst_flags
|= SATURATE
;
/*
 * Instructions that update condition codes are emitted with the NZ
 * conditional modifier; the other call resets it to NONE (else line not
 * visible).
 */
2129 if (inst
->CondUpdate
)
2130 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NZ
);
2132 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
/*
 * NOTE(review): this recomputes dst_flags with the same expression as a few
 * lines above; one of the two looks redundant unless they sit in different
 * scopes -- confirm against the complete file.
 */
2134 dst_flags
= inst
->DstReg
.WriteMask
;
2135 if (inst
->SaturateMode
== SATURATE_ZERO_ONE
)
2136 dst_flags
|= SATURATE
;
/*
 * Dispatch on the opcode. Each visible call forwards the resolved dst/args
 * registers (or the raw instruction) to the helper for that operation.
 */
2138 switch (inst
->Opcode
) {
2140 emit_pixel_xy(c
, dst
, dst_flags
);
2143 emit_delta_xy(p
, dst
, dst_flags
, args
[0]);
2146 emit_pixel_w(c
, dst
, dst_flags
, args
[0], args
[1]);
2149 emit_linterp(p
, dst
, dst_flags
, args
[0], args
[1]);
2152 emit_pinterp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
2155 emit_cinterp(p
, dst
, dst_flags
, args
[0]);
2158 emit_wpos_xy(c
, dst
, dst_flags
, args
[0]);
2161 emit_fb_write(c
, inst
);
2163 case WM_FRONTFACING
:
2164 emit_frontfacing(p
, dst
, dst_flags
);
/* One- and two-operand ALU operations share emit_alu1()/emit_alu2(). */
2167 emit_alu2(p
, brw_ADD
, dst
, dst_flags
, args
[0], args
[1]);
2173 emit_alu1(p
, brw_FRC
, dst
, dst_flags
, args
[0]);
2176 emit_alu1(p
, brw_RNDD
, dst
, dst_flags
, args
[0]);
2179 unalias3(c
, emit_lrp
,
2180 dst
, dst_flags
, args
[0], args
[1], args
[2]);
2183 emit_alu1(p
, brw_RNDZ
, dst
, dst_flags
, args
[0]);
2187 emit_alu1(p
, brw_MOV
, dst
, dst_flags
, args
[0]);
2190 emit_dp3(p
, dst
, dst_flags
, args
[0], args
[1]);
2193 emit_dp4(p
, dst
, dst_flags
, args
[0], args
[1]);
2196 emit_xpd(p
, dst
, dst_flags
, args
[0], args
[1]);
2199 emit_dph(p
, dst
, dst_flags
, args
[0], args
[1]);
/* Single-operand math-unit functions: INV, RSQ, SIN, COS, EXP, LOG. */
2202 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, dst_flags
, args
[0]);
2205 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, dst_flags
, args
[0]);
2208 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, dst_flags
, args
[0]);
2211 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, dst_flags
, args
[0]);
2214 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, dst_flags
, args
[0]);
2217 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, dst_flags
, args
[0]);
2221 emit_min_max(c
, inst
);
/* The boolean argument tells emit_ddxy() whether the opcode is DDX. */
2225 emit_ddxy(p
, dst
, dst_flags
, (inst
->Opcode
== OPCODE_DDX
),
/* Comparison ("set on") operations, one per conditional: L, LE, G, GE, EQ, NEQ. */
2229 emit_sop(p
, dst
, dst_flags
,
2230 BRW_CONDITIONAL_L
, args
[0], args
[1]);
2233 emit_sop(p
, dst
, dst_flags
,
2234 BRW_CONDITIONAL_LE
, args
[0], args
[1]);
2237 emit_sop(p
, dst
, dst_flags
,
2238 BRW_CONDITIONAL_G
, args
[0], args
[1]);
2241 emit_sop(p
, dst
, dst_flags
,
2242 BRW_CONDITIONAL_GE
, args
[0], args
[1]);
2245 emit_sop(p
, dst
, dst_flags
,
2246 BRW_CONDITIONAL_EQ
, args
[0], args
[1]);
2249 emit_sop(p
, dst
, dst_flags
,
2250 BRW_CONDITIONAL_NEQ
, args
[0], args
[1]);
2253 emit_alu2(p
, brw_MUL
, dst
, dst_flags
, args
[0], args
[1]);
2256 emit_math2(c
, BRW_MATH_FUNCTION_POW
,
2257 dst
, dst_flags
, args
[0], args
[1]);
2260 emit_mad(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
2263 emit_noise1(c
, inst
);
2266 emit_noise2(c
, inst
);
2269 emit_noise3(c
, inst
);
2272 emit_noise4(c
, inst
);
/* Open an IF: push the new IF instruction onto the if_inst[] stack. */
2284 assert(if_depth
< MAX_IF_DEPTH
);
2285 if_inst
[if_depth
++] = brw_IF(p
, BRW_EXECUTE_8
);
/* ELSE replaces the top-of-stack entry with the instruction brw_ELSE() returns. */
2288 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
/* Close an IF: pop the stack and let brw_ENDIF() finish the construct. */
2291 assert(if_depth
> 0);
2292 brw_ENDIF(p
, if_inst
[--if_depth
]);
/* Record the current instruction count under the label name held in inst->Comment. */
2295 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
/*
 * Subroutine call sequence, emitted with channel masking disabled:
 *  - store ip + 3*16 at the current stack slot (align1 access),
 *  - advance the stack address register by 4,
 *  - record the call site with brw_save_call(), keyed by inst->Comment,
 *  - add 1*16 to ip.
 * NOTE(review): presumably the last ADD's offset is patched later by
 * brw_resolve_cals(); that is not shown in this extract.
 */
2301 brw_push_insn_state(p
);
2302 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
2303 brw_set_access_mode(p
, BRW_ALIGN_1
);
2304 brw_ADD(p
, deref_1ud(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
2305 brw_set_access_mode(p
, BRW_ALIGN_16
);
2306 brw_ADD(p
, get_addr_reg(stack_index
),
2307 get_addr_reg(stack_index
), brw_imm_d(4));
2308 brw_save_call(&c
->func
, inst
->Comment
, p
->nr_insn
);
2309 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2310 brw_pop_insn_state(p
);
/*
 * Return sequence, the inverse of the call above: move the stack address
 * register back by 4, then load ip from the stack slot (align1 access).
 */
2314 brw_push_insn_state(p
);
2315 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
2316 brw_ADD(p
, get_addr_reg(stack_index
),
2317 get_addr_reg(stack_index
), brw_imm_d(-4));
2318 brw_set_access_mode(p
, BRW_ALIGN_1
);
2319 brw_MOV(p
, brw_ip_reg(), deref_1ud(stack_index
, 0));
2320 brw_set_access_mode(p
, BRW_ALIGN_16
);
2321 brw_pop_insn_state(p
);
/*
 * Open a loop: push the DO instruction onto loop_inst[].
 * NOTE(review): unlike the IF case, no assert against MAX_LOOP_DEPTH is
 * visible before this push.
 */
2324 case OPCODE_BGNLOOP
:
2325 /* XXX may need to invalidate the current_constant regs */
2326 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
/*
 * Predication is switched back to NONE here.
 * NOTE(review): the instructions emitted just before each of these two
 * resets are not visible in this extract.
 */
2330 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2334 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/*
 * Close a loop: emit WHILE against the saved DO instruction, then walk
 * backwards from the WHILE to the DO and patch every BREAK/CONTINUE found.
 * inst1 stays on the WHILE, so a BREAK jumps one instruction past it
 * (inst1 - inst0 + 1) and a CONTINUE jumps to it (inst1 - inst0); both are
 * scaled by `br`.
 * NOTE(review): the declaration/value of `br` (only the IGDNG test is
 * visible), the decrement of loop_depth before the brw_WHILE() call and the
 * step that moves inst0 backwards are not visible here.
 */
2336 case OPCODE_ENDLOOP
:
2338 struct brw_instruction
*inst0
, *inst1
;
2341 if (BRW_IS_IGDNG(brw
))
2345 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
2346 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2347 while (inst0
> loop_inst
[loop_depth
]) {
2349 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
) {
2350 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
2351 inst0
->bits3
.if_else
.pop_count
= 0;
2353 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
) {
2354 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
2355 inst0
->bits3
.if_else
.pop_count
= 0;
/* Diagnostic for an opcode the switch does not handle (its argument is not visible). */
2361 _mesa_printf("unsupported IR in fragment shader %d\n",
/*
 * After the instruction has been emitted: enable normal predication when it
 * updated the condition codes; the other call sets predication to NONE
 * (else line not visible).
 */
2365 if (inst
->CondUpdate
)
2366 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
2368 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* With DEBUG_WM set, disassemble every emitted native instruction to stderr. */
2372 if (INTEL_DEBUG
& DEBUG_WM
) {
2373 _mesa_printf("wm-native:\n");
2374 for (i
= 0; i
< p
->nr_insn
; i
++)
2375 brw_disasm(stderr
, &p
->store
[i
]);
2381 * Do GPU code generation for shaders that use GLSL features such as
2382 * flow control. Other shaders will be compiled with the
/*
 * brw_wm_glsl_emit: entry point for GLSL-style fragment program code
 * generation.
 *
 * brw - driver context, forwarded to brw_wm_emit_glsl().
 * c   - WM compile state; on return c->prog_data.total_grf holds
 *       num_grf_used(c) and c->prog_data.total_scratch is 0.
 *
 * With DEBUG_WM set in INTEL_DEBUG, a banner is printed before code
 * generation and the program is printed afterwards.
 *
 * NOTE(review): the call that performs the "initial instruction
 * translation/simplification" step is not visible in this extract.
 */
2384 void brw_wm_glsl_emit(struct brw_context
*brw
, struct brw_wm_compile
*c
)
2386 if (INTEL_DEBUG
& DEBUG_WM
) {
2387 _mesa_printf("brw_wm_glsl_emit:\n");
2390 /* initial instruction translation/simplification */
2393 /* actual code generation */
2394 brw_wm_emit_glsl(brw
, c
);
2396 if (INTEL_DEBUG
& DEBUG_WM
) {
2397 brw_wm_print_program(c
, "brw_wm_glsl_emit done");
/* Publish the register usage; this path reports no scratch space. */
2400 c
->prog_data
.total_grf
= num_grf_used(c
);
2401 c
->prog_data
.total_scratch
= 0;