1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
10 SUB_NOISE1
, SUB_NOISE2
, SUB_NOISE3
, SUB_NOISE4
13 static struct brw_reg
get_dst_reg(struct brw_wm_compile
*c
,
14 const struct prog_instruction
*inst
,
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
22 GLboolean
brw_wm_is_glsl(const struct gl_fragment_program
*fp
)
26 if (INTEL_DEBUG
& DEBUG_GLSL_FORCE
)
29 for (i
= 0; i
< fp
->Base
.NumInstructions
; i
++) {
30 const struct prog_instruction
*inst
= &fp
->Base
.Instructions
[i
];
31 switch (inst
->Opcode
) {
54 reclaim_temps(struct brw_wm_compile
*c
);
57 /** Mark GRF register as used. */
59 prealloc_grf(struct brw_wm_compile
*c
, int r
)
61 c
->used_grf
[r
] = GL_TRUE
;
65 /** Mark given GRF register as not in use. */
67 release_grf(struct brw_wm_compile
*c
, int r
)
69 /*assert(c->used_grf[r]);*/
70 c
->used_grf
[r
] = GL_FALSE
;
71 c
->first_free_grf
= MIN2(c
->first_free_grf
, r
);
75 /** Return index of a free GRF, mark it as used. */
77 alloc_grf(struct brw_wm_compile
*c
)
80 for (r
= c
->first_free_grf
; r
< BRW_WM_MAX_GRF
; r
++) {
81 if (!c
->used_grf
[r
]) {
82 c
->used_grf
[r
] = GL_TRUE
;
83 c
->first_free_grf
= r
+ 1; /* a guess */
88 /* no free temps, try to reclaim some */
90 c
->first_free_grf
= 0;
93 for (r
= c
->first_free_grf
; r
< BRW_WM_MAX_GRF
; r
++) {
94 if (!c
->used_grf
[r
]) {
95 c
->used_grf
[r
] = GL_TRUE
;
96 c
->first_free_grf
= r
+ 1; /* a guess */
101 for (r
= 0; r
< BRW_WM_MAX_GRF
; r
++) {
102 assert(c
->used_grf
[r
]);
105 /* really, no free GRF regs found */
106 if (!c
->out_of_regs
) {
107 /* print warning once per compilation */
108 _mesa_warning(NULL
, "i965: ran out of registers for fragment program");
109 c
->out_of_regs
= GL_TRUE
;
116 /** Return number of GRF registers used */
118 num_grf_used(const struct brw_wm_compile
*c
)
121 for (r
= BRW_WM_MAX_GRF
- 1; r
>= 0; r
--)
130 * Record the mapping of a Mesa register to a hardware register.
132 static void set_reg(struct brw_wm_compile
*c
, int file
, int index
,
133 int component
, struct brw_reg reg
)
135 c
->wm_regs
[file
][index
][component
].reg
= reg
;
136 c
->wm_regs
[file
][index
][component
].inited
= GL_TRUE
;
139 static struct brw_reg
alloc_tmp(struct brw_wm_compile
*c
)
143 /* if we need to allocate another temp, grow the tmp_regs[] array */
144 if (c
->tmp_index
== c
->tmp_max
) {
145 int r
= alloc_grf(c
);
147 /*printf("Out of temps in %s\n", __FUNCTION__);*/
148 r
= 50; /* XXX random register! */
150 c
->tmp_regs
[ c
->tmp_max
++ ] = r
;
153 /* form the GRF register */
154 reg
= brw_vec8_grf(c
->tmp_regs
[ c
->tmp_index
++ ], 0);
155 /*printf("alloc_temp %d\n", reg.nr);*/
156 assert(reg
.nr
< BRW_WM_MAX_GRF
);
162 * Save current temp register info.
163 * There must be a matching call to release_tmps().
165 static int mark_tmps(struct brw_wm_compile
*c
)
170 static struct brw_reg
lookup_tmp( struct brw_wm_compile
*c
, int index
)
172 return brw_vec8_grf( c
->tmp_regs
[ index
], 0 );
175 static void release_tmps(struct brw_wm_compile
*c
, int mark
)
181 * Convert Mesa src register to brw register.
183 * Since we're running in SOA mode each Mesa register corresponds to four
184 * hardware registers. We allocate the hardware registers as needed here.
186 * \param file register file, one of PROGRAM_x
187 * \param index register number
188 * \param component src component (X=0, Y=1, Z=2, W=3)
189 * \param nr not used?!?
190 * \param neg negate value?
191 * \param abs take absolute value?
193 static struct brw_reg
194 get_reg(struct brw_wm_compile
*c
, int file
, int index
, int component
,
195 int nr
, GLuint neg
, GLuint abs
)
199 case PROGRAM_STATE_VAR
:
200 case PROGRAM_CONSTANT
:
201 case PROGRAM_UNIFORM
:
202 file
= PROGRAM_STATE_VAR
;
204 case PROGRAM_UNDEFINED
:
205 return brw_null_reg();
206 case PROGRAM_TEMPORARY
:
209 case PROGRAM_PAYLOAD
:
212 _mesa_problem(NULL
, "Unexpected file in get_reg()");
213 return brw_null_reg();
217 assert(component
< 4);
219 /* see if we've already allocated a HW register for this Mesa register */
220 if (c
->wm_regs
[file
][index
][component
].inited
) {
222 reg
= c
->wm_regs
[file
][index
][component
].reg
;
225 /* no, allocate new register */
226 int grf
= alloc_grf(c
);
227 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
229 /* totally out of temps */
230 grf
= 51; /* XXX random register! */
233 reg
= brw_vec8_grf(grf
, 0);
234 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
236 set_reg(c
, file
, index
, component
, reg
);
239 if (neg
& (1 << component
)) {
250 * This is called if we run out of GRF registers. Examine the live intervals
251 * of temp regs in the program and free those which won't be used again.
254 reclaim_temps(struct brw_wm_compile
*c
)
256 GLint intBegin
[MAX_PROGRAM_TEMPS
];
257 GLint intEnd
[MAX_PROGRAM_TEMPS
];
260 /*printf("Reclaim temps:\n");*/
262 _mesa_find_temp_intervals(c
->prog_instructions
, c
->nr_fp_insns
,
265 for (index
= 0; index
< MAX_PROGRAM_TEMPS
; index
++) {
266 if (intEnd
[index
] != -1 && intEnd
[index
] < c
->cur_inst
) {
267 /* program temp[i] can be freed */
269 /*printf(" temp[%d] is dead\n", index);*/
270 for (component
= 0; component
< 4; component
++) {
271 if (c
->wm_regs
[PROGRAM_TEMPORARY
][index
][component
].inited
) {
272 int r
= c
->wm_regs
[PROGRAM_TEMPORARY
][index
][component
].reg
.nr
;
275 printf(" Reclaim temp %d, reg %d at inst %d\n",
276 index, r, c->cur_inst);
278 c
->wm_regs
[PROGRAM_TEMPORARY
][index
][component
].inited
= GL_FALSE
;
289 * Preallocate registers. This sets up the Mesa to hardware register
290 * mapping for certain registers, such as constants (uniforms/state vars)
293 static void prealloc_reg(struct brw_wm_compile
*c
)
295 struct intel_context
*intel
= &c
->func
.brw
->intel
;
298 int urb_read_length
= 0;
299 GLuint inputs
= FRAG_BIT_WPOS
| c
->fp_interp_emitted
;
300 GLuint reg_index
= 0;
302 memset(c
->used_grf
, GL_FALSE
, sizeof(c
->used_grf
));
303 c
->first_free_grf
= 0;
305 for (i
= 0; i
< 4; i
++) {
306 if (i
< c
->key
.nr_depth_regs
)
307 reg
= brw_vec8_grf(i
* 2, 0);
309 reg
= brw_vec8_grf(0, 0);
310 set_reg(c
, PROGRAM_PAYLOAD
, PAYLOAD_DEPTH
, i
, reg
);
312 reg_index
+= 2 * c
->key
.nr_depth_regs
;
316 const GLuint nr_params
= c
->fp
->program
.Base
.Parameters
->NumParameters
;
317 const GLuint nr_temps
= c
->fp
->program
.Base
.NumTemporaries
;
319 /* use a real constant buffer, or just use a section of the GRF? */
320 /* XXX this heuristic may need adjustment... */
321 if ((nr_params
+ nr_temps
) * 4 + reg_index
> 80)
322 c
->fp
->use_const_buffer
= GL_TRUE
;
324 c
->fp
->use_const_buffer
= GL_FALSE
;
325 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
327 if (c
->fp
->use_const_buffer
) {
328 /* We'll use a real constant buffer and fetch constants from
329 * it with a dataport read message.
332 /* number of float constants in CURBE */
333 c
->prog_data
.nr_params
= 0;
336 const struct gl_program_parameter_list
*plist
=
337 c
->fp
->program
.Base
.Parameters
;
340 /* number of float constants in CURBE */
341 c
->prog_data
.nr_params
= 4 * nr_params
;
343 /* loop over program constants (float[4]) */
344 for (i
= 0; i
< nr_params
; i
++) {
345 /* loop over XYZW channels */
346 for (j
= 0; j
< 4; j
++, index
++) {
347 reg
= brw_vec1_grf(reg_index
+ index
/ 8, index
% 8);
348 /* Save pointer to parameter/constant value.
349 * Constants will be copied in prepare_constant_buffer()
351 c
->prog_data
.param
[index
] = &plist
->ParameterValues
[i
][j
];
352 set_reg(c
, PROGRAM_STATE_VAR
, i
, j
, reg
);
355 /* number of constant regs used (each reg is float[8]) */
356 c
->nr_creg
= 2 * ((4 * nr_params
+ 15) / 16);
357 reg_index
+= c
->nr_creg
;
361 /* fragment shader inputs */
362 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
365 if (i
>= VERT_RESULT_VAR0
)
366 fp_input
= i
- VERT_RESULT_VAR0
+ FRAG_ATTRIB_VAR0
;
367 else if (i
<= VERT_RESULT_TEX7
)
372 if (fp_input
>= 0 && inputs
& (1 << fp_input
)) {
373 urb_read_length
= reg_index
;
374 reg
= brw_vec8_grf(reg_index
, 0);
375 for (j
= 0; j
< 4; j
++)
376 set_reg(c
, PROGRAM_PAYLOAD
, fp_input
, j
, reg
);
378 if (c
->key
.vp_outputs_written
& BITFIELD64_BIT(i
)) {
383 c
->prog_data
.first_curbe_grf
= c
->key
.nr_depth_regs
* 2;
384 c
->prog_data
.urb_read_length
= urb_read_length
;
385 c
->prog_data
.curb_read_length
= c
->nr_creg
;
386 c
->emit_mask_reg
= brw_uw1_reg(BRW_GENERAL_REGISTER_FILE
, reg_index
, 0);
388 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg_index
, 0);
391 /* mark GRF regs [0..reg_index-1] as in-use */
392 for (i
= 0; i
< reg_index
; i
++)
395 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
396 prealloc_grf(c
, 126);
397 prealloc_grf(c
, 127);
399 for (i
= 0; i
< c
->nr_fp_insns
; i
++) {
400 const struct prog_instruction
*inst
= &c
->prog_instructions
[i
];
401 struct brw_reg dst
[4];
403 switch (inst
->Opcode
) {
406 /* Allocate the channels of texture results contiguously,
407 * since they are written out that way by the sampler unit.
409 for (j
= 0; j
< 4; j
++) {
410 dst
[j
] = get_dst_reg(c
, inst
, j
);
412 assert(dst
[j
].nr
== dst
[j
- 1].nr
+ 1);
420 for (i
= 0; i
< c
->nr_fp_insns
; i
++) {
421 const struct prog_instruction
*inst
= &c
->prog_instructions
[i
];
423 switch (inst
->Opcode
) {
425 /* Allocate WM_DELTAXY destination on G45/GM45 to an
426 * even-numbered GRF if possible so that we can use the PLN
429 if (inst
->DstReg
.WriteMask
== WRITEMASK_XY
&&
430 !c
->wm_regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
][0].inited
&&
431 !c
->wm_regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
][1].inited
&&
432 (IS_G4X(intel
->intelScreen
->deviceID
) || intel
->gen
== 5)) {
435 for (grf
= c
->first_free_grf
& ~1;
436 grf
< BRW_WM_MAX_GRF
;
439 if (!c
->used_grf
[grf
] && !c
->used_grf
[grf
+ 1]) {
440 c
->used_grf
[grf
] = GL_TRUE
;
441 c
->used_grf
[grf
+ 1] = GL_TRUE
;
442 c
->first_free_grf
= grf
+ 2; /* a guess */
444 set_reg(c
, inst
->DstReg
.File
, inst
->DstReg
.Index
, 0,
445 brw_vec8_grf(grf
, 0));
446 set_reg(c
, inst
->DstReg
.File
, inst
->DstReg
.Index
, 1,
447 brw_vec8_grf(grf
+ 1, 0));
457 /* An instruction may reference up to three constants.
458 * They'll be found in these registers.
459 * XXX alloc these on demand!
461 if (c
->fp
->use_const_buffer
) {
462 for (i
= 0; i
< 3; i
++) {
463 c
->current_const
[i
].index
= -1;
464 c
->current_const
[i
].reg
= brw_vec8_grf(alloc_grf(c
), 0);
468 printf("USE CONST BUFFER? %d\n", c
->fp
->use_const_buffer
);
469 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index
);
475 * Check if any of the instruction's src registers are constants, uniforms,
476 * or statevars. If so, fetch any constants that we don't already have in
477 * the three GRF slots.
479 static void fetch_constants(struct brw_wm_compile
*c
,
480 const struct prog_instruction
*inst
)
482 struct brw_compile
*p
= &c
->func
;
485 /* loop over instruction src regs */
486 for (i
= 0; i
< 3; i
++) {
487 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
488 if (src
->File
== PROGRAM_STATE_VAR
||
489 src
->File
== PROGRAM_CONSTANT
||
490 src
->File
== PROGRAM_UNIFORM
) {
491 c
->current_const
[i
].index
= src
->Index
;
494 printf(" fetch const[%d] for arg %d into reg %d\n",
495 src
->Index
, i
, c
->current_const
[i
].reg
.nr
);
498 /* need to fetch the constant now */
500 c
->current_const
[i
].reg
, /* writeback dest */
501 src
->RelAddr
, /* relative indexing? */
502 16 * src
->Index
, /* byte offset */
503 SURF_INDEX_FRAG_CONST_BUFFER
/* binding table index */
511 * Convert Mesa dst register to brw register.
513 static struct brw_reg
get_dst_reg(struct brw_wm_compile
*c
,
514 const struct prog_instruction
*inst
,
518 return get_reg(c
, inst
->DstReg
.File
, inst
->DstReg
.Index
, component
, nr
,
523 static struct brw_reg
524 get_src_reg_const(struct brw_wm_compile
*c
,
525 const struct prog_instruction
*inst
,
526 GLuint srcRegIndex
, GLuint component
)
528 /* We should have already fetched the constant from the constant
529 * buffer in fetch_constants(). Now we just have to return a
530 * register description that extracts the needed component and
531 * smears it across all eight vector components.
533 const struct prog_src_register
*src
= &inst
->SrcReg
[srcRegIndex
];
534 struct brw_reg const_reg
;
536 assert(component
< 4);
537 assert(srcRegIndex
< 3);
538 assert(c
->current_const
[srcRegIndex
].index
!= -1);
539 const_reg
= c
->current_const
[srcRegIndex
].reg
;
541 /* extract desired float from the const_reg, and smear */
542 const_reg
= stride(const_reg
, 0, 1, 0);
543 const_reg
.subnr
= component
* 4;
545 if (src
->Negate
& (1 << component
))
546 const_reg
= negate(const_reg
);
548 const_reg
= brw_abs(const_reg
);
551 printf(" form const[%d].%d for arg %d, reg %d\n",
552 c
->current_const
[srcRegIndex
].index
,
563 * Convert Mesa src register to brw register.
565 static struct brw_reg
get_src_reg(struct brw_wm_compile
*c
,
566 const struct prog_instruction
*inst
,
567 GLuint srcRegIndex
, GLuint channel
)
569 const struct prog_src_register
*src
= &inst
->SrcReg
[srcRegIndex
];
571 const GLuint component
= GET_SWZ(src
->Swizzle
, channel
);
573 /* Extended swizzle terms */
574 if (component
== SWIZZLE_ZERO
) {
575 return brw_imm_f(0.0F
);
577 else if (component
== SWIZZLE_ONE
) {
578 return brw_imm_f(1.0F
);
581 if (c
->fp
->use_const_buffer
&&
582 (src
->File
== PROGRAM_STATE_VAR
||
583 src
->File
== PROGRAM_CONSTANT
||
584 src
->File
== PROGRAM_UNIFORM
)) {
585 return get_src_reg_const(c
, inst
, srcRegIndex
, component
);
588 /* other type of source register */
589 return get_reg(c
, src
->File
, src
->Index
, component
, nr
,
590 src
->Negate
, src
->Abs
);
595 * Subroutines are minimal support for resusable instruction sequences.
596 * They are implemented as simply as possible to minimise overhead: there
597 * is no explicit support for communication between the caller and callee
598 * other than saving the return address in a temporary register, nor is
599 * there any automatic local storage. This implies that great care is
600 * required before attempting reentrancy or any kind of nested
601 * subroutine invocations.
603 static void invoke_subroutine( struct brw_wm_compile
*c
,
604 enum _subroutine subroutine
,
605 void (*emit
)( struct brw_wm_compile
* ) )
607 struct brw_compile
*p
= &c
->func
;
609 assert( subroutine
< BRW_WM_MAX_SUBROUTINE
);
611 if( c
->subroutines
[ subroutine
] ) {
612 /* subroutine previously emitted: reuse existing instructions */
614 int mark
= mark_tmps( c
);
615 struct brw_reg return_address
= retype( alloc_tmp( c
),
616 BRW_REGISTER_TYPE_UD
);
617 int here
= p
->nr_insn
;
619 brw_push_insn_state(p
);
620 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
621 brw_ADD( p
, return_address
, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
623 brw_ADD( p
, brw_ip_reg(), brw_ip_reg(),
624 brw_imm_d( ( c
->subroutines
[ subroutine
] -
626 brw_pop_insn_state(p
);
628 release_tmps( c
, mark
);
630 /* previously unused subroutine: emit, and mark for later reuse */
632 int mark
= mark_tmps( c
);
633 struct brw_reg return_address
= retype( alloc_tmp( c
),
634 BRW_REGISTER_TYPE_UD
);
635 struct brw_instruction
*calc
;
636 int base
= p
->nr_insn
;
638 brw_push_insn_state(p
);
639 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
640 calc
= brw_ADD( p
, return_address
, brw_ip_reg(), brw_imm_ud( 0 ) );
641 brw_pop_insn_state(p
);
643 c
->subroutines
[ subroutine
] = p
->nr_insn
;
647 brw_push_insn_state(p
);
648 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
649 brw_MOV( p
, brw_ip_reg(), return_address
);
650 brw_pop_insn_state(p
);
652 brw_set_src1( calc
, brw_imm_ud( ( p
->nr_insn
- base
) << 4 ) );
654 release_tmps( c
, mark
);
658 static void emit_arl(struct brw_wm_compile
*c
,
659 const struct prog_instruction
*inst
)
661 struct brw_compile
*p
= &c
->func
;
662 struct brw_reg src0
, addr_reg
;
663 brw_set_saturate(p
, (inst
->SaturateMode
!= SATURATE_OFF
) ? 1 : 0);
664 addr_reg
= brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE
,
666 src0
= get_src_reg(c
, inst
, 0, 0); /* channel 0 */
667 brw_MOV(p
, addr_reg
, src0
);
668 brw_set_saturate(p
, 0);
672 * For GLSL shaders, this KIL will be unconditional.
673 * It may be contained inside an IF/ENDIF structure of course.
675 static void emit_kil(struct brw_wm_compile
*c
)
677 struct brw_compile
*p
= &c
->func
;
678 struct brw_reg depth
= retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW
);
679 brw_push_insn_state(p
);
680 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
681 brw_NOT(p
, c
->emit_mask_reg
, brw_mask_reg(1)); /* IMASK */
682 brw_AND(p
, depth
, c
->emit_mask_reg
, depth
);
683 brw_pop_insn_state(p
);
686 static INLINE
struct brw_reg
high_words( struct brw_reg reg
)
688 return stride( suboffset( retype( reg
, BRW_REGISTER_TYPE_W
), 1 ),
692 static INLINE
struct brw_reg
low_words( struct brw_reg reg
)
694 return stride( retype( reg
, BRW_REGISTER_TYPE_W
), 0, 8, 2 );
697 static INLINE
struct brw_reg
even_bytes( struct brw_reg reg
)
699 return stride( retype( reg
, BRW_REGISTER_TYPE_B
), 0, 16, 2 );
702 static INLINE
struct brw_reg
odd_bytes( struct brw_reg reg
)
704 return stride( suboffset( retype( reg
, BRW_REGISTER_TYPE_B
), 1 ),
708 /* One-, two- and three-dimensional Perlin noise, similar to the description
709 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
710 static void noise1_sub( struct brw_wm_compile
*c
) {
712 struct brw_compile
*p
= &c
->func
;
713 struct brw_reg param
,
714 x0
, x1
, /* gradients at each end */
715 t
, tmp
[ 2 ], /* float temporaries */
716 itmp
[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
718 int mark
= mark_tmps( c
);
723 tmp
[ 0 ] = alloc_tmp( c
);
724 tmp
[ 1 ] = alloc_tmp( c
);
725 itmp
[ 0 ] = retype( tmp
[ 0 ], BRW_REGISTER_TYPE_UD
);
726 itmp
[ 1 ] = retype( tmp
[ 1 ], BRW_REGISTER_TYPE_UD
);
727 itmp
[ 2 ] = retype( x0
, BRW_REGISTER_TYPE_UD
);
728 itmp
[ 3 ] = retype( x1
, BRW_REGISTER_TYPE_UD
);
729 itmp
[ 4 ] = retype( t
, BRW_REGISTER_TYPE_UD
);
731 param
= lookup_tmp( c
, mark
- 2 );
733 brw_set_access_mode( p
, BRW_ALIGN_1
);
735 brw_MOV( p
, itmp
[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
737 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
738 be hashed. Also compute the remainder (offset within the unit
739 length), interleaved to reduce register dependency penalties. */
740 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param
);
741 brw_FRC( p
, param
, param
);
742 brw_ADD( p
, itmp
[ 1 ], itmp
[ 0 ], brw_imm_ud( 1 ) );
743 brw_MOV( p
, itmp
[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
744 brw_MOV( p
, itmp
[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
746 /* We're now ready to perform the hashing. The two hashes are
747 interleaved for performance. The hash function used is
748 designed to rapidly achieve avalanche and require only 32x16
749 bit multiplication, and 16-bit swizzles (which we get for
750 free). We can't use immediate operands in the multiplies,
751 because immediates are permitted only in src1 and the 16-bit
752 factor is permitted only in src0. */
753 for( i
= 0; i
< 2; i
++ )
754 brw_MUL( p
, itmp
[ i
], itmp
[ 2 ], itmp
[ i
] );
755 for( i
= 0; i
< 2; i
++ )
756 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
757 high_words( itmp
[ i
] ) );
758 for( i
= 0; i
< 2; i
++ )
759 brw_MUL( p
, itmp
[ i
], itmp
[ 3 ], itmp
[ i
] );
760 for( i
= 0; i
< 2; i
++ )
761 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
762 high_words( itmp
[ i
] ) );
763 for( i
= 0; i
< 2; i
++ )
764 brw_MUL( p
, itmp
[ i
], itmp
[ 4 ], itmp
[ i
] );
765 for( i
= 0; i
< 2; i
++ )
766 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
767 high_words( itmp
[ i
] ) );
769 /* Now we want to initialise the two gradients based on the
770 hashes. Format conversion from signed integer to float leaves
771 everything scaled too high by a factor of pow( 2, 31 ), but
772 we correct for that right at the end. */
773 brw_ADD( p
, t
, param
, brw_imm_f( -1.0 ) );
774 brw_MOV( p
, x0
, retype( tmp
[ 0 ], BRW_REGISTER_TYPE_D
) );
775 brw_MOV( p
, x1
, retype( tmp
[ 1 ], BRW_REGISTER_TYPE_D
) );
777 brw_MUL( p
, x0
, x0
, param
);
778 brw_MUL( p
, x1
, x1
, t
);
780 /* We interpolate between the gradients using the polynomial
781 6t^5 - 15t^4 + 10t^3 (Perlin). */
782 brw_MUL( p
, tmp
[ 0 ], param
, brw_imm_f( 6.0 ) );
783 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( -15.0 ) );
784 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param
);
785 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( 10.0 ) );
786 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param
);
787 brw_ADD( p
, x1
, x1
, negate( x0
) ); /* unrelated work to fill the
789 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param
);
790 brw_MUL( p
, param
, tmp
[ 0 ], param
);
791 brw_MUL( p
, x1
, x1
, param
);
792 brw_ADD( p
, x0
, x0
, x1
);
793 /* scale by pow( 2, -30 ), to compensate for the format conversion
794 above and an extra factor of 2 so that a single gradient covers
796 brw_MUL( p
, param
, x0
, brw_imm_f( 0.000000000931322574615478515625 ) );
798 release_tmps( c
, mark
);
801 static void emit_noise1( struct brw_wm_compile
*c
,
802 const struct prog_instruction
*inst
)
804 struct brw_compile
*p
= &c
->func
;
805 struct brw_reg src
, param
, dst
;
806 GLuint mask
= inst
->DstReg
.WriteMask
;
808 int mark
= mark_tmps( c
);
812 src
= get_src_reg( c
, inst
, 0, 0 );
814 param
= alloc_tmp( c
);
816 brw_MOV( p
, param
, src
);
818 invoke_subroutine( c
, SUB_NOISE1
, noise1_sub
);
820 /* Fill in the result: */
821 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
822 for (i
= 0 ; i
< 4; i
++) {
824 dst
= get_dst_reg(c
, inst
, i
);
825 brw_MOV( p
, dst
, param
);
828 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
829 brw_set_saturate( p
, 0 );
831 release_tmps( c
, mark
);
834 static void noise2_sub( struct brw_wm_compile
*c
) {
836 struct brw_compile
*p
= &c
->func
;
837 struct brw_reg param0
, param1
,
838 x0y0
, x0y1
, x1y0
, x1y1
, /* gradients at each corner */
839 t
, tmp
[ 4 ], /* float temporaries */
840 itmp
[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
842 int mark
= mark_tmps( c
);
844 x0y0
= alloc_tmp( c
);
845 x0y1
= alloc_tmp( c
);
846 x1y0
= alloc_tmp( c
);
847 x1y1
= alloc_tmp( c
);
849 for( i
= 0; i
< 4; i
++ ) {
850 tmp
[ i
] = alloc_tmp( c
);
851 itmp
[ i
] = retype( tmp
[ i
], BRW_REGISTER_TYPE_UD
);
853 itmp
[ 4 ] = retype( x0y0
, BRW_REGISTER_TYPE_UD
);
854 itmp
[ 5 ] = retype( x0y1
, BRW_REGISTER_TYPE_UD
);
855 itmp
[ 6 ] = retype( x1y0
, BRW_REGISTER_TYPE_UD
);
857 param0
= lookup_tmp( c
, mark
- 3 );
858 param1
= lookup_tmp( c
, mark
- 2 );
860 brw_set_access_mode( p
, BRW_ALIGN_1
);
862 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
863 be hashed. Also compute the remainders (offsets within the unit
864 square), interleaved to reduce register dependency penalties. */
865 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param0
);
866 brw_RNDD( p
, retype( itmp
[ 1 ], BRW_REGISTER_TYPE_D
), param1
);
867 brw_FRC( p
, param0
, param0
);
868 brw_FRC( p
, param1
, param1
);
869 brw_MOV( p
, itmp
[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
870 brw_ADD( p
, high_words( itmp
[ 0 ] ), high_words( itmp
[ 0 ] ),
871 low_words( itmp
[ 1 ] ) );
872 brw_MOV( p
, itmp
[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
873 brw_MOV( p
, itmp
[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
874 brw_ADD( p
, itmp
[ 1 ], itmp
[ 0 ], brw_imm_ud( 0x10000 ) );
875 brw_ADD( p
, itmp
[ 2 ], itmp
[ 0 ], brw_imm_ud( 0x1 ) );
876 brw_ADD( p
, itmp
[ 3 ], itmp
[ 0 ], brw_imm_ud( 0x10001 ) );
878 /* We're now ready to perform the hashing. The four hashes are
879 interleaved for performance. The hash function used is
880 designed to rapidly achieve avalanche and require only 32x16
881 bit multiplication, and 16-bit swizzles (which we get for
882 free). We can't use immediate operands in the multiplies,
883 because immediates are permitted only in src1 and the 16-bit
884 factor is permitted only in src0. */
885 for( i
= 0; i
< 4; i
++ )
886 brw_MUL( p
, itmp
[ i
], itmp
[ 4 ], itmp
[ i
] );
887 for( i
= 0; i
< 4; i
++ )
888 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
889 high_words( itmp
[ i
] ) );
890 for( i
= 0; i
< 4; i
++ )
891 brw_MUL( p
, itmp
[ i
], itmp
[ 5 ], itmp
[ i
] );
892 for( i
= 0; i
< 4; i
++ )
893 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
894 high_words( itmp
[ i
] ) );
895 for( i
= 0; i
< 4; i
++ )
896 brw_MUL( p
, itmp
[ i
], itmp
[ 6 ], itmp
[ i
] );
897 for( i
= 0; i
< 4; i
++ )
898 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
899 high_words( itmp
[ i
] ) );
901 /* Now we want to initialise the four gradients based on the
902 hashes. Format conversion from signed integer to float leaves
903 everything scaled too high by a factor of pow( 2, 15 ), but
904 we correct for that right at the end. */
905 brw_ADD( p
, t
, param0
, brw_imm_f( -1.0 ) );
906 brw_MOV( p
, x0y0
, low_words( tmp
[ 0 ] ) );
907 brw_MOV( p
, x0y1
, low_words( tmp
[ 1 ] ) );
908 brw_MOV( p
, x1y0
, low_words( tmp
[ 2 ] ) );
909 brw_MOV( p
, x1y1
, low_words( tmp
[ 3 ] ) );
911 brw_MOV( p
, tmp
[ 0 ], high_words( tmp
[ 0 ] ) );
912 brw_MOV( p
, tmp
[ 1 ], high_words( tmp
[ 1 ] ) );
913 brw_MOV( p
, tmp
[ 2 ], high_words( tmp
[ 2 ] ) );
914 brw_MOV( p
, tmp
[ 3 ], high_words( tmp
[ 3 ] ) );
916 brw_MUL( p
, x1y0
, x1y0
, t
);
917 brw_MUL( p
, x1y1
, x1y1
, t
);
918 brw_ADD( p
, t
, param1
, brw_imm_f( -1.0 ) );
919 brw_MUL( p
, x0y0
, x0y0
, param0
);
920 brw_MUL( p
, x0y1
, x0y1
, param0
);
922 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param1
);
923 brw_MUL( p
, tmp
[ 2 ], tmp
[ 2 ], param1
);
924 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], t
);
925 brw_MUL( p
, tmp
[ 3 ], tmp
[ 3 ], t
);
927 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 0 ] );
928 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 2 ] );
929 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 1 ] );
930 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 3 ] );
932 /* We interpolate between the gradients using the polynomial
933 6t^5 - 15t^4 + 10t^3 (Perlin). */
934 brw_MUL( p
, tmp
[ 0 ], param0
, brw_imm_f( 6.0 ) );
935 brw_MUL( p
, tmp
[ 1 ], param1
, brw_imm_f( 6.0 ) );
936 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( -15.0 ) );
937 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], brw_imm_f( -15.0 ) );
938 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param0
);
939 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], param1
);
940 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) ); /* unrelated work to fill the
942 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( 10.0 ) );
943 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], brw_imm_f( 10.0 ) );
944 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param0
);
945 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], param1
);
946 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) ); /* unrelated work to fill the
948 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param0
);
949 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], param1
);
950 brw_MUL( p
, param0
, tmp
[ 0 ], param0
);
951 brw_MUL( p
, param1
, tmp
[ 1 ], param1
);
953 /* Here we interpolate in the y dimension... */
954 brw_MUL( p
, x0y1
, x0y1
, param1
);
955 brw_MUL( p
, x1y1
, x1y1
, param1
);
956 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
957 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
959 /* And now in x. There are horrible register dependencies here,
960 but we have nothing else to do. */
961 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
962 brw_MUL( p
, x1y0
, x1y0
, param0
);
963 brw_ADD( p
, x0y0
, x0y0
, x1y0
);
965 /* scale by pow( 2, -15 ), as described above */
966 brw_MUL( p
, param0
, x0y0
, brw_imm_f( 0.000030517578125 ) );
968 release_tmps( c
, mark
);
971 static void emit_noise2( struct brw_wm_compile
*c
,
972 const struct prog_instruction
*inst
)
974 struct brw_compile
*p
= &c
->func
;
975 struct brw_reg src0
, src1
, param0
, param1
, dst
;
976 GLuint mask
= inst
->DstReg
.WriteMask
;
978 int mark
= mark_tmps( c
);
982 src0
= get_src_reg( c
, inst
, 0, 0 );
983 src1
= get_src_reg( c
, inst
, 0, 1 );
985 param0
= alloc_tmp( c
);
986 param1
= alloc_tmp( c
);
988 brw_MOV( p
, param0
, src0
);
989 brw_MOV( p
, param1
, src1
);
991 invoke_subroutine( c
, SUB_NOISE2
, noise2_sub
);
993 /* Fill in the result: */
994 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
995 for (i
= 0 ; i
< 4; i
++) {
997 dst
= get_dst_reg(c
, inst
, i
);
998 brw_MOV( p
, dst
, param0
);
1001 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
1002 brw_set_saturate( p
, 0 );
1004 release_tmps( c
, mark
);
1008 * The three-dimensional case is much like the one- and two- versions above,
1009 * but since the number of corners is rapidly growing we now pack 16 16-bit
1010 * hashes into each register to extract more parallelism from the EUs.
1012 static void noise3_sub( struct brw_wm_compile
*c
) {
1014 struct brw_compile
*p
= &c
->func
;
1015 struct brw_reg param0
, param1
, param2
,
1016 x0y0
, x0y1
, x1y0
, x1y1
, /* gradients at four of the corners */
1017 xi
, yi
, zi
, /* interpolation coefficients */
1018 t
, tmp
[ 8 ], /* float temporaries */
1019 itmp
[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1020 wtmp
[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1022 int mark
= mark_tmps( c
);
1024 x0y0
= alloc_tmp( c
);
1025 x0y1
= alloc_tmp( c
);
1026 x1y0
= alloc_tmp( c
);
1027 x1y1
= alloc_tmp( c
);
1028 xi
= alloc_tmp( c
);
1029 yi
= alloc_tmp( c
);
1030 zi
= alloc_tmp( c
);
1032 for( i
= 0; i
< 8; i
++ ) {
1033 tmp
[ i
] = alloc_tmp( c
);
1034 itmp
[ i
] = retype( tmp
[ i
], BRW_REGISTER_TYPE_UD
);
1035 wtmp
[ i
] = brw_uw16_grf( tmp
[ i
].nr
, 0 );
1038 param0
= lookup_tmp( c
, mark
- 4 );
1039 param1
= lookup_tmp( c
, mark
- 3 );
1040 param2
= lookup_tmp( c
, mark
- 2 );
1042 brw_set_access_mode( p
, BRW_ALIGN_1
);
1044 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1045 be hashed. Also compute the remainders (offsets within the unit
1046 cube), interleaved to reduce register dependency penalties. */
1047 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param0
);
1048 brw_RNDD( p
, retype( itmp
[ 1 ], BRW_REGISTER_TYPE_D
), param1
);
1049 brw_RNDD( p
, retype( itmp
[ 2 ], BRW_REGISTER_TYPE_D
), param2
);
1050 brw_FRC( p
, param0
, param0
);
1051 brw_FRC( p
, param1
, param1
);
1052 brw_FRC( p
, param2
, param2
);
1053 /* Since we now have only 16 bits of precision in the hash, we must
1054 be more careful about thorough mixing to maintain entropy as we
1055 squash the input vector into a small scalar. */
1056 brw_MUL( p
, brw_null_reg(), low_words( itmp
[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1057 brw_MAC( p
, brw_null_reg(), low_words( itmp
[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1058 brw_MAC( p
, low_words( itmp
[ 0 ] ), low_words( itmp
[ 2 ] ),
1059 brw_imm_uw( 0x9B93 ) );
1060 brw_ADD( p
, high_words( itmp
[ 0 ] ), low_words( itmp
[ 0 ] ),
1061 brw_imm_uw( 0xBC8F ) );
1063 /* Temporarily disable the execution mask while we work with ExecSize=16
1064 channels (the mask is set for ExecSize=8 and is probably incorrect).
1065 Although this might cause execution of unwanted channels, the code
1066 writes only to temporary registers and has no side effects, so
1067 disabling the mask is harmless. */
1068 brw_push_insn_state( p
);
1069 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1070 brw_ADD( p
, wtmp
[ 1 ], wtmp
[ 0 ], brw_imm_uw( 0xD0BD ) );
1071 brw_ADD( p
, wtmp
[ 2 ], wtmp
[ 0 ], brw_imm_uw( 0x9B93 ) );
1072 brw_ADD( p
, wtmp
[ 3 ], wtmp
[ 1 ], brw_imm_uw( 0x9B93 ) );
1074 /* We're now ready to perform the hashing. The eight hashes are
1075 interleaved for performance. The hash function used is
1076 designed to rapidly achieve avalanche and require only 16x16
1077 bit multiplication, and 8-bit swizzles (which we get for
1079 for( i
= 0; i
< 4; i
++ )
1080 brw_MUL( p
, wtmp
[ i
], wtmp
[ i
], brw_imm_uw( 0x28D9 ) );
1081 for( i
= 0; i
< 4; i
++ )
1082 brw_XOR( p
, even_bytes( wtmp
[ i
] ), even_bytes( wtmp
[ i
] ),
1083 odd_bytes( wtmp
[ i
] ) );
1084 for( i
= 0; i
< 4; i
++ )
1085 brw_MUL( p
, wtmp
[ i
], wtmp
[ i
], brw_imm_uw( 0xC6D5 ) );
1086 for( i
= 0; i
< 4; i
++ )
1087 brw_XOR( p
, even_bytes( wtmp
[ i
] ), even_bytes( wtmp
[ i
] ),
1088 odd_bytes( wtmp
[ i
] ) );
1089 brw_pop_insn_state( p
);
1091 /* Now we want to initialise the four rear gradients based on the
1092 hashes. Format conversion from signed integer to float leaves
1093 everything scaled too high by a factor of pow( 2, 15 ), but
1094 we correct for that right at the end. */
1096 brw_ADD( p
, t
, param0
, brw_imm_f( -1.0 ) );
1097 brw_MOV( p
, x0y0
, low_words( tmp
[ 0 ] ) );
1098 brw_MOV( p
, x0y1
, low_words( tmp
[ 1 ] ) );
1099 brw_MOV( p
, x1y0
, high_words( tmp
[ 0 ] ) );
1100 brw_MOV( p
, x1y1
, high_words( tmp
[ 1 ] ) );
1102 brw_push_insn_state( p
);
1103 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1104 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 5 ) );
1105 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 5 ) );
1106 brw_pop_insn_state( p
);
1108 brw_MUL( p
, x1y0
, x1y0
, t
);
1109 brw_MUL( p
, x1y1
, x1y1
, t
);
1110 brw_ADD( p
, t
, param1
, brw_imm_f( -1.0 ) );
1111 brw_MUL( p
, x0y0
, x0y0
, param0
);
1112 brw_MUL( p
, x0y1
, x0y1
, param0
);
1115 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
1116 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
1117 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
1118 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
1120 brw_push_insn_state( p
);
1121 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1122 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 5 ) );
1123 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 5 ) );
1124 brw_pop_insn_state( p
);
1126 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1127 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1128 brw_ADD( p
, t
, param0
, brw_imm_f( -1.0 ) );
1129 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param1
);
1130 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param1
);
1132 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1133 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1134 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1135 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1138 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
1139 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
1140 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
1141 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
1143 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param2
);
1144 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], param2
);
1145 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param2
);
1146 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], param2
);
1148 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1149 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1150 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1151 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1153 /* We interpolate between the gradients using the polynomial
1154 6t^5 - 15t^4 + 10t^3 (Perlin). */
1155 brw_MUL( p
, xi
, param0
, brw_imm_f( 6.0 ) );
1156 brw_MUL( p
, yi
, param1
, brw_imm_f( 6.0 ) );
1157 brw_MUL( p
, zi
, param2
, brw_imm_f( 6.0 ) );
1158 brw_ADD( p
, xi
, xi
, brw_imm_f( -15.0 ) );
1159 brw_ADD( p
, yi
, yi
, brw_imm_f( -15.0 ) );
1160 brw_ADD( p
, zi
, zi
, brw_imm_f( -15.0 ) );
1161 brw_MUL( p
, xi
, xi
, param0
);
1162 brw_MUL( p
, yi
, yi
, param1
);
1163 brw_MUL( p
, zi
, zi
, param2
);
1164 brw_ADD( p
, xi
, xi
, brw_imm_f( 10.0 ) );
1165 brw_ADD( p
, yi
, yi
, brw_imm_f( 10.0 ) );
1166 brw_ADD( p
, zi
, zi
, brw_imm_f( 10.0 ) );
1167 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) ); /* unrelated work */
1168 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) ); /* unrelated work */
1169 brw_MUL( p
, xi
, xi
, param0
);
1170 brw_MUL( p
, yi
, yi
, param1
);
1171 brw_MUL( p
, zi
, zi
, param2
);
1172 brw_MUL( p
, xi
, xi
, param0
);
1173 brw_MUL( p
, yi
, yi
, param1
);
1174 brw_MUL( p
, zi
, zi
, param2
);
1175 brw_MUL( p
, xi
, xi
, param0
);
1176 brw_MUL( p
, yi
, yi
, param1
);
1177 brw_MUL( p
, zi
, zi
, param2
);
1179 /* Here we interpolate in the y dimension... */
1180 brw_MUL( p
, x0y1
, x0y1
, yi
);
1181 brw_MUL( p
, x1y1
, x1y1
, yi
);
1182 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
1183 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
1185 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1186 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
1187 brw_MUL( p
, x1y0
, x1y0
, xi
);
1188 brw_ADD( p
, tmp
[ 0 ], x0y0
, x1y0
);
1190 /* Now do the same thing for the front four gradients... */
1192 brw_MOV( p
, x0y0
, low_words( tmp
[ 2 ] ) );
1193 brw_MOV( p
, x0y1
, low_words( tmp
[ 3 ] ) );
1194 brw_MOV( p
, x1y0
, high_words( tmp
[ 2 ] ) );
1195 brw_MOV( p
, x1y1
, high_words( tmp
[ 3 ] ) );
1197 brw_push_insn_state( p
);
1198 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1199 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 5 ) );
1200 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 5 ) );
1201 brw_pop_insn_state( p
);
1203 brw_MUL( p
, x1y0
, x1y0
, t
);
1204 brw_MUL( p
, x1y1
, x1y1
, t
);
1205 brw_ADD( p
, t
, param1
, brw_imm_f( -1.0 ) );
1206 brw_MUL( p
, x0y0
, x0y0
, param0
);
1207 brw_MUL( p
, x0y1
, x0y1
, param0
);
1210 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
1211 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
1212 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
1213 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
1215 brw_push_insn_state( p
);
1216 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1217 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 5 ) );
1218 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 5 ) );
1219 brw_pop_insn_state( p
);
1221 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1222 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1223 brw_ADD( p
, t
, param2
, brw_imm_f( -1.0 ) );
1224 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param1
);
1225 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param1
);
1227 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1228 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1229 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1230 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1233 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
1234 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
1235 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
1236 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
1238 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], t
);
1239 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1240 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], t
);
1241 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1243 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1244 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1245 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1246 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1248 /* The interpolation coefficients are still around from last time, so
1249 again interpolate in the y dimension... */
1250 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) );
1251 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) );
1252 brw_MUL( p
, x0y1
, x0y1
, yi
);
1253 brw_MUL( p
, x1y1
, x1y1
, yi
);
1254 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
1255 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
1257 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1258 time put the front face in tmp[ 1 ] and we're nearly there... */
1259 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
1260 brw_MUL( p
, x1y0
, x1y0
, xi
);
1261 brw_ADD( p
, tmp
[ 1 ], x0y0
, x1y0
);
1263 /* The final interpolation, in the z dimension: */
1264 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], negate( tmp
[ 0 ] ) );
1265 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], zi
);
1266 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], tmp
[ 1 ] );
1268 /* scale by pow( 2, -15 ), as described above */
1269 brw_MUL( p
, param0
, tmp
[ 0 ], brw_imm_f( 0.000030517578125 ) );
1271 release_tmps( c
, mark
);
1274 static void emit_noise3( struct brw_wm_compile
*c
,
1275 const struct prog_instruction
*inst
)
1277 struct brw_compile
*p
= &c
->func
;
1278 struct brw_reg src0
, src1
, src2
, param0
, param1
, param2
, dst
;
1279 GLuint mask
= inst
->DstReg
.WriteMask
;
1281 int mark
= mark_tmps( c
);
1283 assert( mark
== 0 );
1285 src0
= get_src_reg( c
, inst
, 0, 0 );
1286 src1
= get_src_reg( c
, inst
, 0, 1 );
1287 src2
= get_src_reg( c
, inst
, 0, 2 );
1289 param0
= alloc_tmp( c
);
1290 param1
= alloc_tmp( c
);
1291 param2
= alloc_tmp( c
);
1293 brw_MOV( p
, param0
, src0
);
1294 brw_MOV( p
, param1
, src1
);
1295 brw_MOV( p
, param2
, src2
);
1297 invoke_subroutine( c
, SUB_NOISE3
, noise3_sub
);
1299 /* Fill in the result: */
1300 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
1301 for (i
= 0 ; i
< 4; i
++) {
1302 if (mask
& (1<<i
)) {
1303 dst
= get_dst_reg(c
, inst
, i
);
1304 brw_MOV( p
, dst
, param0
);
1307 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
1308 brw_set_saturate( p
, 0 );
1310 release_tmps( c
, mark
);
/*
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes.  Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 */
static void noise4_sub( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg param[ 4 ],
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      w0, /* noise for the w=0 cube */
      floors[ 2 ], /* integer coordinates of base corner of hypercube */
      interp[ 4 ], /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   GLuint i, j; /* loop counters -- NOTE(review): this declaration was not
                   visible in the extracted text; confirm against upstream */
   int mark = mark_tmps( c );
   GLuint loop, origin;

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c ); /* NOTE(review): restored -- t is read below but its
                          allocation was missing from the extracted text */
   w0 = alloc_tmp( c );
   floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
   floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

   /* NOTE(review): "mark - 5 + i" addresses the four parameter slots pushed
      by the caller before invoke_subroutine, plus one extra slot -- confirm
      the exact stack layout against invoke_subroutine/emit_noise4. */
   for( i = 0; i < 4; i++ ) {
      param[ i ] = lookup_tmp( c, mark - 5 + i );
      interp[ i ] = alloc_tmp( c );
   }

   /* itmp/wtmp alias the same GRFs as tmp, reinterpreted as dwords and as
      16-way unsigned words respectively. */
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* We only want 16 bits of precision from the integral part of each
      co-ordinate, but unfortunately the RNDD semantics would saturate
      at 16 bits if we performed the operation directly to a 16-bit
      destination.  Therefore, we round to 32-bit temporaries where
      appropriate, and then store only the lower 16 bits. */
   brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
   brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
   brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
   brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

   /* Modify the flag register here, because the side effect is useful
      later (see below).  We know for certain that all flags will be
      cleared, since the FRC instruction cannot possibly generate
      negative results.  Even for exceptional inputs (infinities, denormals,
      NaNs), the architecture guarantees that the L conditional is false. */
   brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
   brw_FRC( p, param[ 0 ], param[ 0 ] );
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   for( i = 1; i < 4; i++ )
      brw_FRC( p, param[ i ], param[ i ] );

   /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
      of all. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
   for( j = 0; j < 3; j++ )
      for( i = 0; i < 4; i++ )
	 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

   /* Mark the current address, as it will be a jump destination.  The
      following code will be executed twice: first, with the flag
      register clear indicating the w=0 case, and second with flags
      set for the w=1 case.  NOTE(review): the "loop = p->nr_insn;"
      statement was missing from the extracted text and has been restored
      from context (loop is read by the backward branch below). */
   loop = p->nr_insn;

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Since we have only 16 bits of precision in the hash, we
      must be careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
	    brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 0xA359 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component of the gradient dot products: */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component: */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component (t currently holds w or w-1, set up above): */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Here we interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component: */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component: */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component: */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Interpolate in the y dimension: */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* Another interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* Exit the loop if we've computed both cubes.  The branch distance is
      unknown here; a placeholder of 0 is emitted and patched below. */
   origin = p->nr_insn;
   brw_push_insn_state( p );
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
   brw_pop_insn_state( p );

   /* Save the result for the w=0 case, and increment the w coordinate:
      NOTE(review): the "brw_imm_uw( 1 )" continuation line was missing
      from the extracted text; restored from context. */
   brw_MOV( p, w0, tmp[ 0 ] );
   brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 1 ) );

   /* Loop around for the other cube.  Explicitly set the flag register
      (unfortunately we must spend an extra instruction to do this: we
      can't rely on a side effect of the previous MOV or ADD because
      conditional modifiers which are normally true might be false in
      exceptional circumstances, e.g. given a NaN input; the add to
      brw_ip_reg() is not suitable because the IP is not an 8-vector). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	    brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
   brw_pop_insn_state( p );

   /* Patch the previous conditional branch now that we know the
      destination address. */
   brw_set_src1( p->store + origin,
		 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

   /* The very last interpolation, between the w=0 and w=1 results. */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

   /* scale by pow( 2, -15 ), as described above; the result is returned
      to the caller in the first parameter slot */
   brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1697 static void emit_noise4( struct brw_wm_compile
*c
,
1698 const struct prog_instruction
*inst
)
1700 struct brw_compile
*p
= &c
->func
;
1701 struct brw_reg src0
, src1
, src2
, src3
, param0
, param1
, param2
, param3
, dst
;
1702 GLuint mask
= inst
->DstReg
.WriteMask
;
1704 int mark
= mark_tmps( c
);
1706 assert( mark
== 0 );
1708 src0
= get_src_reg( c
, inst
, 0, 0 );
1709 src1
= get_src_reg( c
, inst
, 0, 1 );
1710 src2
= get_src_reg( c
, inst
, 0, 2 );
1711 src3
= get_src_reg( c
, inst
, 0, 3 );
1713 param0
= alloc_tmp( c
);
1714 param1
= alloc_tmp( c
);
1715 param2
= alloc_tmp( c
);
1716 param3
= alloc_tmp( c
);
1718 brw_MOV( p
, param0
, src0
);
1719 brw_MOV( p
, param1
, src1
);
1720 brw_MOV( p
, param2
, src2
);
1721 brw_MOV( p
, param3
, src3
);
1723 invoke_subroutine( c
, SUB_NOISE4
, noise4_sub
);
1725 /* Fill in the result: */
1726 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
1727 for (i
= 0 ; i
< 4; i
++) {
1728 if (mask
& (1<<i
)) {
1729 dst
= get_dst_reg(c
, inst
, i
);
1730 brw_MOV( p
, dst
, param0
);
1733 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
1734 brw_set_saturate( p
, 0 );
1736 release_tmps( c
, mark
);
1740 * Resolve subroutine calls after code emit is done.
1742 static void post_wm_emit( struct brw_wm_compile
*c
)
1744 brw_resolve_cals(&c
->func
);
/* Fetch the source registers for operand `index` of `inst` into regs[0..3],
   one per enabled channel of `mask`.  If a fetched source register aliases
   one of the instruction's destination registers (dst[0..3]), the source is
   copied into a fresh temporary so that writing the destination cannot
   clobber a not-yet-read source.  The caller is expected to release the
   temporaries afterwards via release_tmps(). */
static void
get_argument_regs(struct brw_wm_compile *c,
                  const struct prog_instruction *inst,
                  int index,
                  struct brw_reg *dst,
                  struct brw_reg *regs,
                  int mask)
{
   struct brw_compile *p = &c->func;
   int i, j;

   for (i = 0; i < 4; i++) {
      if (mask & (1 << i)) {
         regs[i] = get_src_reg(c, inst, index, i);

         /* Unalias destination registers from our sources.
            Only GRF sources can alias a GRF destination; immediates
            and other files cannot conflict. */
         if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
            for (j = 0; j < 4; j++) {
               if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
                  /* Found a conflicting usage: snapshot the source into a
                     temporary before the destination gets written.
                     NOTE(review): the tail of this function (the copy-back
                     "regs[i] = tmp;" and loop break) was missing from the
                     extracted text and has been restored from context --
                     confirm against upstream. */
                  struct brw_reg tmp = alloc_tmp(c);
                  brw_MOV(p, tmp, regs[i]);
                  regs[i] = tmp;
                  break;
               }
            }
         }
      }
   }
}
1777 static void brw_wm_emit_glsl(struct brw_context
*brw
, struct brw_wm_compile
*c
)
1779 struct intel_context
*intel
= &brw
->intel
;
1780 #define MAX_IF_DEPTH 32
1781 #define MAX_LOOP_DEPTH 32
1782 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
];
1783 GLuint i
, if_depth
= 0, loop_depth
= 0;
1784 struct brw_compile
*p
= &c
->func
;
1785 struct brw_indirect stack_index
= brw_indirect(0, 0);
1787 c
->out_of_regs
= GL_FALSE
;
1790 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1791 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1793 for (i
= 0; i
< c
->nr_fp_insns
; i
++) {
1794 const struct prog_instruction
*inst
= &c
->prog_instructions
[i
];
1796 struct brw_reg args
[3][4], dst
[4];
1798 int mark
= mark_tmps( c
);
1803 printf("Inst %d: ", i
);
1804 _mesa_print_instruction(inst
);
1807 /* fetch any constants that this instruction needs */
1808 if (c
->fp
->use_const_buffer
)
1809 fetch_constants(c
, inst
);
1811 if (inst
->Opcode
!= OPCODE_ARL
) {
1812 for (j
= 0; j
< 4; j
++) {
1813 if (inst
->DstReg
.WriteMask
& (1 << j
))
1814 dst
[j
] = get_dst_reg(c
, inst
, j
);
1816 dst
[j
] = brw_null_reg();
1819 for (j
= 0; j
< brw_wm_nr_args(inst
->Opcode
); j
++)
1820 get_argument_regs(c
, inst
, j
, dst
, args
[j
], WRITEMASK_XYZW
);
1822 dst_flags
= inst
->DstReg
.WriteMask
;
1823 if (inst
->SaturateMode
== SATURATE_ZERO_ONE
)
1824 dst_flags
|= SATURATE
;
1826 if (inst
->CondUpdate
)
1827 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NZ
);
1829 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
1831 switch (inst
->Opcode
) {
1833 emit_pixel_xy(c
, dst
, dst_flags
);
1836 emit_delta_xy(p
, dst
, dst_flags
, args
[0]);
1839 emit_pixel_w(c
, dst
, dst_flags
, args
[0], args
[1]);
1842 emit_linterp(p
, dst
, dst_flags
, args
[0], args
[1]);
1845 emit_pinterp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1848 emit_cinterp(p
, dst
, dst_flags
, args
[0]);
1851 emit_wpos_xy(c
, dst
, dst_flags
, args
[0]);
1854 emit_fb_write(c
, args
[0], args
[1], args
[2],
1855 INST_AUX_GET_TARGET(inst
->Aux
),
1856 inst
->Aux
& INST_AUX_EOT
);
1858 case WM_FRONTFACING
:
1859 emit_frontfacing(p
, dst
, dst_flags
);
1862 emit_alu2(p
, brw_ADD
, dst
, dst_flags
, args
[0], args
[1]);
1868 emit_alu1(p
, brw_FRC
, dst
, dst_flags
, args
[0]);
1871 emit_alu1(p
, brw_RNDD
, dst
, dst_flags
, args
[0]);
1874 emit_lrp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1877 emit_alu1(p
, brw_RNDZ
, dst
, dst_flags
, args
[0]);
1881 emit_alu1(p
, brw_MOV
, dst
, dst_flags
, args
[0]);
1884 emit_dp3(p
, dst
, dst_flags
, args
[0], args
[1]);
1887 emit_dp4(p
, dst
, dst_flags
, args
[0], args
[1]);
1890 emit_xpd(p
, dst
, dst_flags
, args
[0], args
[1]);
1893 emit_dph(p
, dst
, dst_flags
, args
[0], args
[1]);
1896 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, dst_flags
, args
[0]);
1899 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, dst_flags
, args
[0]);
1902 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, dst_flags
, args
[0]);
1905 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, dst_flags
, args
[0]);
1908 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, dst_flags
, args
[0]);
1911 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, dst_flags
, args
[0]);
1914 emit_cmp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1917 emit_min(p
, dst
, dst_flags
, args
[0], args
[1]);
1920 emit_max(p
, dst
, dst_flags
, args
[0], args
[1]);
1924 emit_ddxy(p
, dst
, dst_flags
, (inst
->Opcode
== OPCODE_DDX
),
1928 emit_sop(p
, dst
, dst_flags
,
1929 BRW_CONDITIONAL_L
, args
[0], args
[1]);
1932 emit_sop(p
, dst
, dst_flags
,
1933 BRW_CONDITIONAL_LE
, args
[0], args
[1]);
1936 emit_sop(p
, dst
, dst_flags
,
1937 BRW_CONDITIONAL_G
, args
[0], args
[1]);
1940 emit_sop(p
, dst
, dst_flags
,
1941 BRW_CONDITIONAL_GE
, args
[0], args
[1]);
1944 emit_sop(p
, dst
, dst_flags
,
1945 BRW_CONDITIONAL_EQ
, args
[0], args
[1]);
1948 emit_sop(p
, dst
, dst_flags
,
1949 BRW_CONDITIONAL_NEQ
, args
[0], args
[1]);
1952 emit_alu2(p
, brw_MUL
, dst
, dst_flags
, args
[0], args
[1]);
1955 emit_math2(c
, BRW_MATH_FUNCTION_POW
,
1956 dst
, dst_flags
, args
[0], args
[1]);
1959 emit_mad(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1962 emit_noise1(c
, inst
);
1965 emit_noise2(c
, inst
);
1968 emit_noise3(c
, inst
);
1971 emit_noise4(c
, inst
);
1974 emit_tex(c
, dst
, dst_flags
, args
[0],
1975 get_reg(c
, PROGRAM_PAYLOAD
, PAYLOAD_DEPTH
,
1979 (c
->key
.shadowtex_mask
& (1 << inst
->TexSrcUnit
)) != 0);
1982 emit_txb(c
, dst
, dst_flags
, args
[0],
1983 get_reg(c
, PROGRAM_PAYLOAD
, PAYLOAD_DEPTH
,
1986 c
->fp
->program
.Base
.SamplerUnits
[inst
->TexSrcUnit
]);
1992 assert(if_depth
< MAX_IF_DEPTH
);
1993 if_inst
[if_depth
++] = brw_IF(p
, BRW_EXECUTE_8
);
1996 assert(if_depth
> 0);
1997 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
2000 assert(if_depth
> 0);
2001 brw_ENDIF(p
, if_inst
[--if_depth
]);
2004 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
2010 brw_push_insn_state(p
);
2011 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
2012 brw_set_access_mode(p
, BRW_ALIGN_1
);
2013 brw_ADD(p
, deref_1ud(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
2014 brw_set_access_mode(p
, BRW_ALIGN_16
);
2015 brw_ADD(p
, get_addr_reg(stack_index
),
2016 get_addr_reg(stack_index
), brw_imm_d(4));
2017 brw_save_call(&c
->func
, inst
->Comment
, p
->nr_insn
);
2018 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2019 brw_pop_insn_state(p
);
2023 brw_push_insn_state(p
);
2024 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
2025 brw_ADD(p
, get_addr_reg(stack_index
),
2026 get_addr_reg(stack_index
), brw_imm_d(-4));
2027 brw_set_access_mode(p
, BRW_ALIGN_1
);
2028 brw_MOV(p
, brw_ip_reg(), deref_1ud(stack_index
, 0));
2029 brw_set_access_mode(p
, BRW_ALIGN_16
);
2030 brw_pop_insn_state(p
);
2033 case OPCODE_BGNLOOP
:
2034 /* XXX may need to invalidate the current_constant regs */
2035 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
2039 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2043 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2045 case OPCODE_ENDLOOP
:
2047 struct brw_instruction
*inst0
, *inst1
;
2050 if (intel
->is_ironlake
)
2053 assert(loop_depth
> 0);
2055 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
2056 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2057 while (inst0
> loop_inst
[loop_depth
]) {
2059 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
2060 inst0
->bits3
.if_else
.jump_count
== 0) {
2061 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
2062 inst0
->bits3
.if_else
.pop_count
= 0;
2064 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
2065 inst0
->bits3
.if_else
.jump_count
== 0) {
2066 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
2067 inst0
->bits3
.if_else
.pop_count
= 0;
2073 printf("unsupported opcode %d (%s) in fragment shader\n",
2074 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
2075 _mesa_opcode_string(inst
->Opcode
) : "unknown");
2078 /* Release temporaries containing any unaliased source regs. */
2079 release_tmps( c
, mark
);
2081 if (inst
->CondUpdate
)
2082 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
2084 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2088 if (INTEL_DEBUG
& DEBUG_WM
) {
2089 printf("wm-native:\n");
2090 for (i
= 0; i
< p
->nr_insn
; i
++)
2091 brw_disasm(stderr
, &p
->store
[i
]);
2097 * Do GPU code generation for shaders that use GLSL features such as
2098 * flow control. Other shaders will be compiled with the
2100 void brw_wm_glsl_emit(struct brw_context
*brw
, struct brw_wm_compile
*c
)
2102 if (INTEL_DEBUG
& DEBUG_WM
) {
2103 printf("brw_wm_glsl_emit:\n");
2106 /* initial instruction translation/simplification */
2109 /* actual code generation */
2110 brw_wm_emit_glsl(brw
, c
);
2112 if (INTEL_DEBUG
& DEBUG_WM
) {
2113 brw_wm_print_program(c
, "brw_wm_glsl_emit done");
2116 c
->prog_data
.total_grf
= num_grf_used(c
);
2117 c
->prog_data
.total_scratch
= 0;