1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
/** Identifiers for the Perlin-noise subroutines emitted once and reused
 * via invoke_subroutine().
 * NOTE(review): the "enum _subroutine {" opener/closer are not visible in
 * this garbled listing; reconstructed from invoke_subroutine()'s
 * "enum _subroutine" parameter and the SUB_NOISE* call sites below.
 */
enum _subroutine {
   SUB_NOISE1,
   SUB_NOISE2,
   SUB_NOISE3,
   SUB_NOISE4
};
13 static struct brw_reg
get_dst_reg(struct brw_wm_compile
*c
,
14 const struct prog_instruction
*inst
,
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
22 GLboolean
brw_wm_is_glsl(const struct gl_fragment_program
*fp
)
26 for (i
= 0; i
< fp
->Base
.NumInstructions
; i
++) {
27 const struct prog_instruction
*inst
= &fp
->Base
.Instructions
[i
];
28 switch (inst
->Opcode
) {
/* Forward declaration; needed because alloc_grf() calls reclaim_temps()
 * and the definition appears further down. */
static void
reclaim_temps(struct brw_wm_compile *c);
54 /** Mark GRF register as used. */
56 prealloc_grf(struct brw_wm_compile
*c
, int r
)
58 c
->used_grf
[r
] = GL_TRUE
;
62 /** Mark given GRF register as not in use. */
64 release_grf(struct brw_wm_compile
*c
, int r
)
66 /*assert(c->used_grf[r]);*/
67 c
->used_grf
[r
] = GL_FALSE
;
68 c
->first_free_grf
= MIN2(c
->first_free_grf
, r
);
72 /** Return index of a free GRF, mark it as used. */
74 alloc_grf(struct brw_wm_compile
*c
)
77 for (r
= c
->first_free_grf
; r
< BRW_WM_MAX_GRF
; r
++) {
78 if (!c
->used_grf
[r
]) {
79 c
->used_grf
[r
] = GL_TRUE
;
80 c
->first_free_grf
= r
+ 1; /* a guess */
85 /* no free temps, try to reclaim some */
87 c
->first_free_grf
= 0;
90 for (r
= c
->first_free_grf
; r
< BRW_WM_MAX_GRF
; r
++) {
91 if (!c
->used_grf
[r
]) {
92 c
->used_grf
[r
] = GL_TRUE
;
93 c
->first_free_grf
= r
+ 1; /* a guess */
98 for (r
= 0; r
< BRW_WM_MAX_GRF
; r
++) {
99 assert(c
->used_grf
[r
]);
102 /* really, no free GRF regs found */
103 if (!c
->out_of_regs
) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL
, "i965: ran out of registers for fragment program");
106 c
->out_of_regs
= GL_TRUE
;
113 /** Return number of GRF registers used */
115 num_grf_used(const struct brw_wm_compile
*c
)
118 for (r
= BRW_WM_MAX_GRF
- 1; r
>= 0; r
--)
127 * Record the mapping of a Mesa register to a hardware register.
129 static void set_reg(struct brw_wm_compile
*c
, int file
, int index
,
130 int component
, struct brw_reg reg
)
132 c
->wm_regs
[file
][index
][component
].reg
= reg
;
133 c
->wm_regs
[file
][index
][component
].inited
= GL_TRUE
;
136 static struct brw_reg
alloc_tmp(struct brw_wm_compile
*c
)
140 /* if we need to allocate another temp, grow the tmp_regs[] array */
141 if (c
->tmp_index
== c
->tmp_max
) {
142 int r
= alloc_grf(c
);
144 /*printf("Out of temps in %s\n", __FUNCTION__);*/
145 r
= 50; /* XXX random register! */
147 c
->tmp_regs
[ c
->tmp_max
++ ] = r
;
150 /* form the GRF register */
151 reg
= brw_vec8_grf(c
->tmp_regs
[ c
->tmp_index
++ ], 0);
152 /*printf("alloc_temp %d\n", reg.nr);*/
153 assert(reg
.nr
< BRW_WM_MAX_GRF
);
159 * Save current temp register info.
160 * There must be a matching call to release_tmps().
162 static int mark_tmps(struct brw_wm_compile
*c
)
167 static struct brw_reg
lookup_tmp( struct brw_wm_compile
*c
, int index
)
169 return brw_vec8_grf( c
->tmp_regs
[ index
], 0 );
172 static void release_tmps(struct brw_wm_compile
*c
, int mark
)
178 * Convert Mesa src register to brw register.
180 * Since we're running in SOA mode each Mesa register corresponds to four
181 * hardware registers. We allocate the hardware registers as needed here.
183 * \param file register file, one of PROGRAM_x
184 * \param index register number
185 * \param component src component (X=0, Y=1, Z=2, W=3)
186 * \param nr not used?!?
187 * \param neg negate value?
188 * \param abs take absolute value?
190 static struct brw_reg
191 get_reg(struct brw_wm_compile
*c
, int file
, int index
, int component
,
192 int nr
, GLuint neg
, GLuint abs
)
196 case PROGRAM_STATE_VAR
:
197 case PROGRAM_CONSTANT
:
198 case PROGRAM_UNIFORM
:
199 file
= PROGRAM_STATE_VAR
;
201 case PROGRAM_UNDEFINED
:
202 return brw_null_reg();
203 case PROGRAM_TEMPORARY
:
206 case PROGRAM_PAYLOAD
:
209 _mesa_problem(NULL
, "Unexpected file in get_reg()");
210 return brw_null_reg();
214 assert(component
< 4);
216 /* see if we've already allocated a HW register for this Mesa register */
217 if (c
->wm_regs
[file
][index
][component
].inited
) {
219 reg
= c
->wm_regs
[file
][index
][component
].reg
;
222 /* no, allocate new register */
223 int grf
= alloc_grf(c
);
224 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
226 /* totally out of temps */
227 grf
= 51; /* XXX random register! */
230 reg
= brw_vec8_grf(grf
, 0);
231 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
233 set_reg(c
, file
, index
, component
, reg
);
236 if (neg
& (1 << component
)) {
247 * This is called if we run out of GRF registers. Examine the live intervals
248 * of temp regs in the program and free those which won't be used again.
251 reclaim_temps(struct brw_wm_compile
*c
)
253 GLint intBegin
[MAX_PROGRAM_TEMPS
];
254 GLint intEnd
[MAX_PROGRAM_TEMPS
];
257 /*printf("Reclaim temps:\n");*/
259 _mesa_find_temp_intervals(c
->prog_instructions
, c
->nr_fp_insns
,
262 for (index
= 0; index
< MAX_PROGRAM_TEMPS
; index
++) {
263 if (intEnd
[index
] != -1 && intEnd
[index
] < c
->cur_inst
) {
264 /* program temp[i] can be freed */
266 /*printf(" temp[%d] is dead\n", index);*/
267 for (component
= 0; component
< 4; component
++) {
268 if (c
->wm_regs
[PROGRAM_TEMPORARY
][index
][component
].inited
) {
269 int r
= c
->wm_regs
[PROGRAM_TEMPORARY
][index
][component
].reg
.nr
;
272 printf(" Reclaim temp %d, reg %d at inst %d\n",
273 index, r, c->cur_inst);
275 c
->wm_regs
[PROGRAM_TEMPORARY
][index
][component
].inited
= GL_FALSE
;
286 * Preallocate registers. This sets up the Mesa to hardware register
287 * mapping for certain registers, such as constants (uniforms/state vars)
290 static void prealloc_reg(struct brw_wm_compile
*c
)
292 struct intel_context
*intel
= &c
->func
.brw
->intel
;
295 int urb_read_length
= 0;
296 GLuint inputs
= FRAG_BIT_WPOS
| c
->fp_interp_emitted
;
297 GLuint reg_index
= 0;
299 memset(c
->used_grf
, GL_FALSE
, sizeof(c
->used_grf
));
300 c
->first_free_grf
= 0;
302 for (i
= 0; i
< 4; i
++) {
303 if (i
< c
->key
.nr_depth_regs
)
304 reg
= brw_vec8_grf(i
* 2, 0);
306 reg
= brw_vec8_grf(0, 0);
307 set_reg(c
, PROGRAM_PAYLOAD
, PAYLOAD_DEPTH
, i
, reg
);
309 reg_index
+= 2 * c
->key
.nr_depth_regs
;
313 const GLuint nr_params
= c
->fp
->program
.Base
.Parameters
->NumParameters
;
314 const GLuint nr_temps
= c
->fp
->program
.Base
.NumTemporaries
;
316 /* use a real constant buffer, or just use a section of the GRF? */
317 /* XXX this heuristic may need adjustment... */
318 if ((nr_params
+ nr_temps
) * 4 + reg_index
> 80)
319 c
->fp
->use_const_buffer
= GL_TRUE
;
321 c
->fp
->use_const_buffer
= GL_FALSE
;
322 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
324 if (c
->fp
->use_const_buffer
) {
325 /* We'll use a real constant buffer and fetch constants from
326 * it with a dataport read message.
329 /* number of float constants in CURBE */
330 c
->prog_data
.nr_params
= 0;
333 const struct gl_program_parameter_list
*plist
=
334 c
->fp
->program
.Base
.Parameters
;
337 /* number of float constants in CURBE */
338 c
->prog_data
.nr_params
= 4 * nr_params
;
340 /* loop over program constants (float[4]) */
341 for (i
= 0; i
< nr_params
; i
++) {
342 /* loop over XYZW channels */
343 for (j
= 0; j
< 4; j
++, index
++) {
344 reg
= brw_vec1_grf(reg_index
+ index
/ 8, index
% 8);
345 /* Save pointer to parameter/constant value.
346 * Constants will be copied in prepare_constant_buffer()
348 c
->prog_data
.param
[index
] = &plist
->ParameterValues
[i
][j
];
349 set_reg(c
, PROGRAM_STATE_VAR
, i
, j
, reg
);
352 /* number of constant regs used (each reg is float[8]) */
353 c
->nr_creg
= 2 * ((4 * nr_params
+ 15) / 16);
354 reg_index
+= c
->nr_creg
;
358 /* fragment shader inputs */
359 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
362 if (i
>= VERT_RESULT_VAR0
)
363 fp_input
= i
- VERT_RESULT_VAR0
+ FRAG_ATTRIB_VAR0
;
364 else if (i
<= VERT_RESULT_TEX7
)
369 if (fp_input
>= 0 && inputs
& (1 << fp_input
)) {
370 urb_read_length
= reg_index
;
371 reg
= brw_vec8_grf(reg_index
, 0);
372 for (j
= 0; j
< 4; j
++)
373 set_reg(c
, PROGRAM_PAYLOAD
, fp_input
, j
, reg
);
375 if (c
->key
.vp_outputs_written
& BITFIELD64_BIT(i
)) {
380 c
->prog_data
.first_curbe_grf
= c
->key
.nr_depth_regs
* 2;
381 c
->prog_data
.urb_read_length
= urb_read_length
;
382 c
->prog_data
.curb_read_length
= c
->nr_creg
;
383 c
->emit_mask_reg
= brw_uw1_reg(BRW_GENERAL_REGISTER_FILE
, reg_index
, 0);
385 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg_index
, 0);
388 /* mark GRF regs [0..reg_index-1] as in-use */
389 for (i
= 0; i
< reg_index
; i
++)
392 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
393 prealloc_grf(c
, 126);
394 prealloc_grf(c
, 127);
396 for (i
= 0; i
< c
->nr_fp_insns
; i
++) {
397 const struct prog_instruction
*inst
= &c
->prog_instructions
[i
];
398 struct brw_reg dst
[4];
400 switch (inst
->Opcode
) {
403 /* Allocate the channels of texture results contiguously,
404 * since they are written out that way by the sampler unit.
406 for (j
= 0; j
< 4; j
++) {
407 dst
[j
] = get_dst_reg(c
, inst
, j
);
409 assert(dst
[j
].nr
== dst
[j
- 1].nr
+ 1);
417 for (i
= 0; i
< c
->nr_fp_insns
; i
++) {
418 const struct prog_instruction
*inst
= &c
->prog_instructions
[i
];
420 switch (inst
->Opcode
) {
422 /* Allocate WM_DELTAXY destination on G45/GM45 to an
423 * even-numbered GRF if possible so that we can use the PLN
426 if (inst
->DstReg
.WriteMask
== WRITEMASK_XY
&&
427 !c
->wm_regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
][0].inited
&&
428 !c
->wm_regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
][1].inited
&&
429 (IS_G4X(intel
->intelScreen
->deviceID
) || intel
->gen
== 5)) {
432 for (grf
= c
->first_free_grf
& ~1;
433 grf
< BRW_WM_MAX_GRF
;
436 if (!c
->used_grf
[grf
] && !c
->used_grf
[grf
+ 1]) {
437 c
->used_grf
[grf
] = GL_TRUE
;
438 c
->used_grf
[grf
+ 1] = GL_TRUE
;
439 c
->first_free_grf
= grf
+ 2; /* a guess */
441 set_reg(c
, inst
->DstReg
.File
, inst
->DstReg
.Index
, 0,
442 brw_vec8_grf(grf
, 0));
443 set_reg(c
, inst
->DstReg
.File
, inst
->DstReg
.Index
, 1,
444 brw_vec8_grf(grf
+ 1, 0));
454 /* An instruction may reference up to three constants.
455 * They'll be found in these registers.
456 * XXX alloc these on demand!
458 if (c
->fp
->use_const_buffer
) {
459 for (i
= 0; i
< 3; i
++) {
460 c
->current_const
[i
].index
= -1;
461 c
->current_const
[i
].reg
= brw_vec8_grf(alloc_grf(c
), 0);
465 printf("USE CONST BUFFER? %d\n", c
->fp
->use_const_buffer
);
466 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index
);
472 * Check if any of the instruction's src registers are constants, uniforms,
473 * or statevars. If so, fetch any constants that we don't already have in
474 * the three GRF slots.
476 static void fetch_constants(struct brw_wm_compile
*c
,
477 const struct prog_instruction
*inst
)
479 struct brw_compile
*p
= &c
->func
;
482 /* loop over instruction src regs */
483 for (i
= 0; i
< 3; i
++) {
484 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
485 if (src
->File
== PROGRAM_STATE_VAR
||
486 src
->File
== PROGRAM_CONSTANT
||
487 src
->File
== PROGRAM_UNIFORM
) {
488 c
->current_const
[i
].index
= src
->Index
;
491 printf(" fetch const[%d] for arg %d into reg %d\n",
492 src
->Index
, i
, c
->current_const
[i
].reg
.nr
);
495 /* need to fetch the constant now */
497 c
->current_const
[i
].reg
, /* writeback dest */
498 src
->RelAddr
, /* relative indexing? */
499 16 * src
->Index
, /* byte offset */
500 SURF_INDEX_FRAG_CONST_BUFFER
/* binding table index */
508 * Convert Mesa dst register to brw register.
510 static struct brw_reg
get_dst_reg(struct brw_wm_compile
*c
,
511 const struct prog_instruction
*inst
,
515 return get_reg(c
, inst
->DstReg
.File
, inst
->DstReg
.Index
, component
, nr
,
520 static struct brw_reg
521 get_src_reg_const(struct brw_wm_compile
*c
,
522 const struct prog_instruction
*inst
,
523 GLuint srcRegIndex
, GLuint component
)
525 /* We should have already fetched the constant from the constant
526 * buffer in fetch_constants(). Now we just have to return a
527 * register description that extracts the needed component and
528 * smears it across all eight vector components.
530 const struct prog_src_register
*src
= &inst
->SrcReg
[srcRegIndex
];
531 struct brw_reg const_reg
;
533 assert(component
< 4);
534 assert(srcRegIndex
< 3);
535 assert(c
->current_const
[srcRegIndex
].index
!= -1);
536 const_reg
= c
->current_const
[srcRegIndex
].reg
;
538 /* extract desired float from the const_reg, and smear */
539 const_reg
= stride(const_reg
, 0, 1, 0);
540 const_reg
.subnr
= component
* 4;
542 if (src
->Negate
& (1 << component
))
543 const_reg
= negate(const_reg
);
545 const_reg
= brw_abs(const_reg
);
548 printf(" form const[%d].%d for arg %d, reg %d\n",
549 c
->current_const
[srcRegIndex
].index
,
560 * Convert Mesa src register to brw register.
562 static struct brw_reg
get_src_reg(struct brw_wm_compile
*c
,
563 const struct prog_instruction
*inst
,
564 GLuint srcRegIndex
, GLuint channel
)
566 const struct prog_src_register
*src
= &inst
->SrcReg
[srcRegIndex
];
568 const GLuint component
= GET_SWZ(src
->Swizzle
, channel
);
570 /* Extended swizzle terms */
571 if (component
== SWIZZLE_ZERO
) {
572 return brw_imm_f(0.0F
);
574 else if (component
== SWIZZLE_ONE
) {
575 return brw_imm_f(1.0F
);
578 if (c
->fp
->use_const_buffer
&&
579 (src
->File
== PROGRAM_STATE_VAR
||
580 src
->File
== PROGRAM_CONSTANT
||
581 src
->File
== PROGRAM_UNIFORM
)) {
582 return get_src_reg_const(c
, inst
, srcRegIndex
, component
);
585 /* other type of source register */
586 return get_reg(c
, src
->File
, src
->Index
, component
, nr
,
587 src
->Negate
, src
->Abs
);
592 * Subroutines are minimal support for resusable instruction sequences.
593 * They are implemented as simply as possible to minimise overhead: there
594 * is no explicit support for communication between the caller and callee
595 * other than saving the return address in a temporary register, nor is
596 * there any automatic local storage. This implies that great care is
597 * required before attempting reentrancy or any kind of nested
598 * subroutine invocations.
600 static void invoke_subroutine( struct brw_wm_compile
*c
,
601 enum _subroutine subroutine
,
602 void (*emit
)( struct brw_wm_compile
* ) )
604 struct brw_compile
*p
= &c
->func
;
606 assert( subroutine
< BRW_WM_MAX_SUBROUTINE
);
608 if( c
->subroutines
[ subroutine
] ) {
609 /* subroutine previously emitted: reuse existing instructions */
611 int mark
= mark_tmps( c
);
612 struct brw_reg return_address
= retype( alloc_tmp( c
),
613 BRW_REGISTER_TYPE_UD
);
614 int here
= p
->nr_insn
;
616 brw_push_insn_state(p
);
617 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
618 brw_ADD( p
, return_address
, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
620 brw_ADD( p
, brw_ip_reg(), brw_ip_reg(),
621 brw_imm_d( ( c
->subroutines
[ subroutine
] -
623 brw_pop_insn_state(p
);
625 release_tmps( c
, mark
);
627 /* previously unused subroutine: emit, and mark for later reuse */
629 int mark
= mark_tmps( c
);
630 struct brw_reg return_address
= retype( alloc_tmp( c
),
631 BRW_REGISTER_TYPE_UD
);
632 struct brw_instruction
*calc
;
633 int base
= p
->nr_insn
;
635 brw_push_insn_state(p
);
636 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
637 calc
= brw_ADD( p
, return_address
, brw_ip_reg(), brw_imm_ud( 0 ) );
638 brw_pop_insn_state(p
);
640 c
->subroutines
[ subroutine
] = p
->nr_insn
;
644 brw_push_insn_state(p
);
645 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
646 brw_MOV( p
, brw_ip_reg(), return_address
);
647 brw_pop_insn_state(p
);
649 brw_set_src1( calc
, brw_imm_ud( ( p
->nr_insn
- base
) << 4 ) );
651 release_tmps( c
, mark
);
655 static void emit_arl(struct brw_wm_compile
*c
,
656 const struct prog_instruction
*inst
)
658 struct brw_compile
*p
= &c
->func
;
659 struct brw_reg src0
, addr_reg
;
660 brw_set_saturate(p
, (inst
->SaturateMode
!= SATURATE_OFF
) ? 1 : 0);
661 addr_reg
= brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE
,
663 src0
= get_src_reg(c
, inst
, 0, 0); /* channel 0 */
664 brw_MOV(p
, addr_reg
, src0
);
665 brw_set_saturate(p
, 0);
669 * For GLSL shaders, this KIL will be unconditional.
670 * It may be contained inside an IF/ENDIF structure of course.
672 static void emit_kil(struct brw_wm_compile
*c
)
674 struct brw_compile
*p
= &c
->func
;
675 struct brw_reg depth
= retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW
);
676 brw_push_insn_state(p
);
677 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
678 brw_NOT(p
, c
->emit_mask_reg
, brw_mask_reg(1)); /* IMASK */
679 brw_AND(p
, depth
, c
->emit_mask_reg
, depth
);
680 brw_pop_insn_state(p
);
683 static INLINE
struct brw_reg
high_words( struct brw_reg reg
)
685 return stride( suboffset( retype( reg
, BRW_REGISTER_TYPE_W
), 1 ),
689 static INLINE
struct brw_reg
low_words( struct brw_reg reg
)
691 return stride( retype( reg
, BRW_REGISTER_TYPE_W
), 0, 8, 2 );
694 static INLINE
struct brw_reg
even_bytes( struct brw_reg reg
)
696 return stride( retype( reg
, BRW_REGISTER_TYPE_B
), 0, 16, 2 );
699 static INLINE
struct brw_reg
odd_bytes( struct brw_reg reg
)
701 return stride( suboffset( retype( reg
, BRW_REGISTER_TYPE_B
), 1 ),
705 /* One-, two- and three-dimensional Perlin noise, similar to the description
706 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
707 static void noise1_sub( struct brw_wm_compile
*c
) {
709 struct brw_compile
*p
= &c
->func
;
710 struct brw_reg param
,
711 x0
, x1
, /* gradients at each end */
712 t
, tmp
[ 2 ], /* float temporaries */
713 itmp
[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
715 int mark
= mark_tmps( c
);
720 tmp
[ 0 ] = alloc_tmp( c
);
721 tmp
[ 1 ] = alloc_tmp( c
);
722 itmp
[ 0 ] = retype( tmp
[ 0 ], BRW_REGISTER_TYPE_UD
);
723 itmp
[ 1 ] = retype( tmp
[ 1 ], BRW_REGISTER_TYPE_UD
);
724 itmp
[ 2 ] = retype( x0
, BRW_REGISTER_TYPE_UD
);
725 itmp
[ 3 ] = retype( x1
, BRW_REGISTER_TYPE_UD
);
726 itmp
[ 4 ] = retype( t
, BRW_REGISTER_TYPE_UD
);
728 param
= lookup_tmp( c
, mark
- 2 );
730 brw_set_access_mode( p
, BRW_ALIGN_1
);
732 brw_MOV( p
, itmp
[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
734 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
735 be hashed. Also compute the remainder (offset within the unit
736 length), interleaved to reduce register dependency penalties. */
737 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param
);
738 brw_FRC( p
, param
, param
);
739 brw_ADD( p
, itmp
[ 1 ], itmp
[ 0 ], brw_imm_ud( 1 ) );
740 brw_MOV( p
, itmp
[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
741 brw_MOV( p
, itmp
[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
743 /* We're now ready to perform the hashing. The two hashes are
744 interleaved for performance. The hash function used is
745 designed to rapidly achieve avalanche and require only 32x16
746 bit multiplication, and 16-bit swizzles (which we get for
747 free). We can't use immediate operands in the multiplies,
748 because immediates are permitted only in src1 and the 16-bit
749 factor is permitted only in src0. */
750 for( i
= 0; i
< 2; i
++ )
751 brw_MUL( p
, itmp
[ i
], itmp
[ 2 ], itmp
[ i
] );
752 for( i
= 0; i
< 2; i
++ )
753 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
754 high_words( itmp
[ i
] ) );
755 for( i
= 0; i
< 2; i
++ )
756 brw_MUL( p
, itmp
[ i
], itmp
[ 3 ], itmp
[ i
] );
757 for( i
= 0; i
< 2; i
++ )
758 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
759 high_words( itmp
[ i
] ) );
760 for( i
= 0; i
< 2; i
++ )
761 brw_MUL( p
, itmp
[ i
], itmp
[ 4 ], itmp
[ i
] );
762 for( i
= 0; i
< 2; i
++ )
763 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
764 high_words( itmp
[ i
] ) );
766 /* Now we want to initialise the two gradients based on the
767 hashes. Format conversion from signed integer to float leaves
768 everything scaled too high by a factor of pow( 2, 31 ), but
769 we correct for that right at the end. */
770 brw_ADD( p
, t
, param
, brw_imm_f( -1.0 ) );
771 brw_MOV( p
, x0
, retype( tmp
[ 0 ], BRW_REGISTER_TYPE_D
) );
772 brw_MOV( p
, x1
, retype( tmp
[ 1 ], BRW_REGISTER_TYPE_D
) );
774 brw_MUL( p
, x0
, x0
, param
);
775 brw_MUL( p
, x1
, x1
, t
);
777 /* We interpolate between the gradients using the polynomial
778 6t^5 - 15t^4 + 10t^3 (Perlin). */
779 brw_MUL( p
, tmp
[ 0 ], param
, brw_imm_f( 6.0 ) );
780 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( -15.0 ) );
781 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param
);
782 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( 10.0 ) );
783 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param
);
784 brw_ADD( p
, x1
, x1
, negate( x0
) ); /* unrelated work to fill the
786 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param
);
787 brw_MUL( p
, param
, tmp
[ 0 ], param
);
788 brw_MUL( p
, x1
, x1
, param
);
789 brw_ADD( p
, x0
, x0
, x1
);
790 /* scale by pow( 2, -30 ), to compensate for the format conversion
791 above and an extra factor of 2 so that a single gradient covers
793 brw_MUL( p
, param
, x0
, brw_imm_f( 0.000000000931322574615478515625 ) );
795 release_tmps( c
, mark
);
798 static void emit_noise1( struct brw_wm_compile
*c
,
799 const struct prog_instruction
*inst
)
801 struct brw_compile
*p
= &c
->func
;
802 struct brw_reg src
, param
, dst
;
803 GLuint mask
= inst
->DstReg
.WriteMask
;
805 int mark
= mark_tmps( c
);
809 src
= get_src_reg( c
, inst
, 0, 0 );
811 param
= alloc_tmp( c
);
813 brw_MOV( p
, param
, src
);
815 invoke_subroutine( c
, SUB_NOISE1
, noise1_sub
);
817 /* Fill in the result: */
818 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
819 for (i
= 0 ; i
< 4; i
++) {
821 dst
= get_dst_reg(c
, inst
, i
);
822 brw_MOV( p
, dst
, param
);
825 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
826 brw_set_saturate( p
, 0 );
828 release_tmps( c
, mark
);
831 static void noise2_sub( struct brw_wm_compile
*c
) {
833 struct brw_compile
*p
= &c
->func
;
834 struct brw_reg param0
, param1
,
835 x0y0
, x0y1
, x1y0
, x1y1
, /* gradients at each corner */
836 t
, tmp
[ 4 ], /* float temporaries */
837 itmp
[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
839 int mark
= mark_tmps( c
);
841 x0y0
= alloc_tmp( c
);
842 x0y1
= alloc_tmp( c
);
843 x1y0
= alloc_tmp( c
);
844 x1y1
= alloc_tmp( c
);
846 for( i
= 0; i
< 4; i
++ ) {
847 tmp
[ i
] = alloc_tmp( c
);
848 itmp
[ i
] = retype( tmp
[ i
], BRW_REGISTER_TYPE_UD
);
850 itmp
[ 4 ] = retype( x0y0
, BRW_REGISTER_TYPE_UD
);
851 itmp
[ 5 ] = retype( x0y1
, BRW_REGISTER_TYPE_UD
);
852 itmp
[ 6 ] = retype( x1y0
, BRW_REGISTER_TYPE_UD
);
854 param0
= lookup_tmp( c
, mark
- 3 );
855 param1
= lookup_tmp( c
, mark
- 2 );
857 brw_set_access_mode( p
, BRW_ALIGN_1
);
859 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
860 be hashed. Also compute the remainders (offsets within the unit
861 square), interleaved to reduce register dependency penalties. */
862 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param0
);
863 brw_RNDD( p
, retype( itmp
[ 1 ], BRW_REGISTER_TYPE_D
), param1
);
864 brw_FRC( p
, param0
, param0
);
865 brw_FRC( p
, param1
, param1
);
866 brw_MOV( p
, itmp
[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
867 brw_ADD( p
, high_words( itmp
[ 0 ] ), high_words( itmp
[ 0 ] ),
868 low_words( itmp
[ 1 ] ) );
869 brw_MOV( p
, itmp
[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
870 brw_MOV( p
, itmp
[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
871 brw_ADD( p
, itmp
[ 1 ], itmp
[ 0 ], brw_imm_ud( 0x10000 ) );
872 brw_ADD( p
, itmp
[ 2 ], itmp
[ 0 ], brw_imm_ud( 0x1 ) );
873 brw_ADD( p
, itmp
[ 3 ], itmp
[ 0 ], brw_imm_ud( 0x10001 ) );
875 /* We're now ready to perform the hashing. The four hashes are
876 interleaved for performance. The hash function used is
877 designed to rapidly achieve avalanche and require only 32x16
878 bit multiplication, and 16-bit swizzles (which we get for
879 free). We can't use immediate operands in the multiplies,
880 because immediates are permitted only in src1 and the 16-bit
881 factor is permitted only in src0. */
882 for( i
= 0; i
< 4; i
++ )
883 brw_MUL( p
, itmp
[ i
], itmp
[ 4 ], itmp
[ i
] );
884 for( i
= 0; i
< 4; i
++ )
885 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
886 high_words( itmp
[ i
] ) );
887 for( i
= 0; i
< 4; i
++ )
888 brw_MUL( p
, itmp
[ i
], itmp
[ 5 ], itmp
[ i
] );
889 for( i
= 0; i
< 4; i
++ )
890 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
891 high_words( itmp
[ i
] ) );
892 for( i
= 0; i
< 4; i
++ )
893 brw_MUL( p
, itmp
[ i
], itmp
[ 6 ], itmp
[ i
] );
894 for( i
= 0; i
< 4; i
++ )
895 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
896 high_words( itmp
[ i
] ) );
898 /* Now we want to initialise the four gradients based on the
899 hashes. Format conversion from signed integer to float leaves
900 everything scaled too high by a factor of pow( 2, 15 ), but
901 we correct for that right at the end. */
902 brw_ADD( p
, t
, param0
, brw_imm_f( -1.0 ) );
903 brw_MOV( p
, x0y0
, low_words( tmp
[ 0 ] ) );
904 brw_MOV( p
, x0y1
, low_words( tmp
[ 1 ] ) );
905 brw_MOV( p
, x1y0
, low_words( tmp
[ 2 ] ) );
906 brw_MOV( p
, x1y1
, low_words( tmp
[ 3 ] ) );
908 brw_MOV( p
, tmp
[ 0 ], high_words( tmp
[ 0 ] ) );
909 brw_MOV( p
, tmp
[ 1 ], high_words( tmp
[ 1 ] ) );
910 brw_MOV( p
, tmp
[ 2 ], high_words( tmp
[ 2 ] ) );
911 brw_MOV( p
, tmp
[ 3 ], high_words( tmp
[ 3 ] ) );
913 brw_MUL( p
, x1y0
, x1y0
, t
);
914 brw_MUL( p
, x1y1
, x1y1
, t
);
915 brw_ADD( p
, t
, param1
, brw_imm_f( -1.0 ) );
916 brw_MUL( p
, x0y0
, x0y0
, param0
);
917 brw_MUL( p
, x0y1
, x0y1
, param0
);
919 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param1
);
920 brw_MUL( p
, tmp
[ 2 ], tmp
[ 2 ], param1
);
921 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], t
);
922 brw_MUL( p
, tmp
[ 3 ], tmp
[ 3 ], t
);
924 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 0 ] );
925 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 2 ] );
926 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 1 ] );
927 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 3 ] );
929 /* We interpolate between the gradients using the polynomial
930 6t^5 - 15t^4 + 10t^3 (Perlin). */
931 brw_MUL( p
, tmp
[ 0 ], param0
, brw_imm_f( 6.0 ) );
932 brw_MUL( p
, tmp
[ 1 ], param1
, brw_imm_f( 6.0 ) );
933 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( -15.0 ) );
934 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], brw_imm_f( -15.0 ) );
935 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param0
);
936 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], param1
);
937 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) ); /* unrelated work to fill the
939 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( 10.0 ) );
940 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], brw_imm_f( 10.0 ) );
941 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param0
);
942 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], param1
);
943 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) ); /* unrelated work to fill the
945 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param0
);
946 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], param1
);
947 brw_MUL( p
, param0
, tmp
[ 0 ], param0
);
948 brw_MUL( p
, param1
, tmp
[ 1 ], param1
);
950 /* Here we interpolate in the y dimension... */
951 brw_MUL( p
, x0y1
, x0y1
, param1
);
952 brw_MUL( p
, x1y1
, x1y1
, param1
);
953 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
954 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
956 /* And now in x. There are horrible register dependencies here,
957 but we have nothing else to do. */
958 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
959 brw_MUL( p
, x1y0
, x1y0
, param0
);
960 brw_ADD( p
, x0y0
, x0y0
, x1y0
);
962 /* scale by pow( 2, -15 ), as described above */
963 brw_MUL( p
, param0
, x0y0
, brw_imm_f( 0.000030517578125 ) );
965 release_tmps( c
, mark
);
968 static void emit_noise2( struct brw_wm_compile
*c
,
969 const struct prog_instruction
*inst
)
971 struct brw_compile
*p
= &c
->func
;
972 struct brw_reg src0
, src1
, param0
, param1
, dst
;
973 GLuint mask
= inst
->DstReg
.WriteMask
;
975 int mark
= mark_tmps( c
);
979 src0
= get_src_reg( c
, inst
, 0, 0 );
980 src1
= get_src_reg( c
, inst
, 0, 1 );
982 param0
= alloc_tmp( c
);
983 param1
= alloc_tmp( c
);
985 brw_MOV( p
, param0
, src0
);
986 brw_MOV( p
, param1
, src1
);
988 invoke_subroutine( c
, SUB_NOISE2
, noise2_sub
);
990 /* Fill in the result: */
991 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
992 for (i
= 0 ; i
< 4; i
++) {
994 dst
= get_dst_reg(c
, inst
, i
);
995 brw_MOV( p
, dst
, param0
);
998 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
999 brw_set_saturate( p
, 0 );
1001 release_tmps( c
, mark
);
1005 * The three-dimensional case is much like the one- and two- versions above,
1006 * but since the number of corners is rapidly growing we now pack 16 16-bit
1007 * hashes into each register to extract more parallelism from the EUs.
1009 static void noise3_sub( struct brw_wm_compile
*c
) {
1011 struct brw_compile
*p
= &c
->func
;
1012 struct brw_reg param0
, param1
, param2
,
1013 x0y0
, x0y1
, x1y0
, x1y1
, /* gradients at four of the corners */
1014 xi
, yi
, zi
, /* interpolation coefficients */
1015 t
, tmp
[ 8 ], /* float temporaries */
1016 itmp
[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1017 wtmp
[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1019 int mark
= mark_tmps( c
);
1021 x0y0
= alloc_tmp( c
);
1022 x0y1
= alloc_tmp( c
);
1023 x1y0
= alloc_tmp( c
);
1024 x1y1
= alloc_tmp( c
);
1025 xi
= alloc_tmp( c
);
1026 yi
= alloc_tmp( c
);
1027 zi
= alloc_tmp( c
);
1029 for( i
= 0; i
< 8; i
++ ) {
1030 tmp
[ i
] = alloc_tmp( c
);
1031 itmp
[ i
] = retype( tmp
[ i
], BRW_REGISTER_TYPE_UD
);
1032 wtmp
[ i
] = brw_uw16_grf( tmp
[ i
].nr
, 0 );
1035 param0
= lookup_tmp( c
, mark
- 4 );
1036 param1
= lookup_tmp( c
, mark
- 3 );
1037 param2
= lookup_tmp( c
, mark
- 2 );
1039 brw_set_access_mode( p
, BRW_ALIGN_1
);
1041 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1042 be hashed. Also compute the remainders (offsets within the unit
1043 cube), interleaved to reduce register dependency penalties. */
1044 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param0
);
1045 brw_RNDD( p
, retype( itmp
[ 1 ], BRW_REGISTER_TYPE_D
), param1
);
1046 brw_RNDD( p
, retype( itmp
[ 2 ], BRW_REGISTER_TYPE_D
), param2
);
1047 brw_FRC( p
, param0
, param0
);
1048 brw_FRC( p
, param1
, param1
);
1049 brw_FRC( p
, param2
, param2
);
1050 /* Since we now have only 16 bits of precision in the hash, we must
1051 be more careful about thorough mixing to maintain entropy as we
1052 squash the input vector into a small scalar. */
1053 brw_MUL( p
, brw_null_reg(), low_words( itmp
[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1054 brw_MAC( p
, brw_null_reg(), low_words( itmp
[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1055 brw_MAC( p
, low_words( itmp
[ 0 ] ), low_words( itmp
[ 2 ] ),
1056 brw_imm_uw( 0x9B93 ) );
1057 brw_ADD( p
, high_words( itmp
[ 0 ] ), low_words( itmp
[ 0 ] ),
1058 brw_imm_uw( 0xBC8F ) );
1060 /* Temporarily disable the execution mask while we work with ExecSize=16
1061 channels (the mask is set for ExecSize=8 and is probably incorrect).
1062 Although this might cause execution of unwanted channels, the code
1063 writes only to temporary registers and has no side effects, so
1064 disabling the mask is harmless. */
1065 brw_push_insn_state( p
);
1066 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1067 brw_ADD( p
, wtmp
[ 1 ], wtmp
[ 0 ], brw_imm_uw( 0xD0BD ) );
1068 brw_ADD( p
, wtmp
[ 2 ], wtmp
[ 0 ], brw_imm_uw( 0x9B93 ) );
1069 brw_ADD( p
, wtmp
[ 3 ], wtmp
[ 1 ], brw_imm_uw( 0x9B93 ) );
1071 /* We're now ready to perform the hashing. The eight hashes are
1072 interleaved for performance. The hash function used is
1073 designed to rapidly achieve avalanche and require only 16x16
1074 bit multiplication, and 8-bit swizzles (which we get for
1076 for( i
= 0; i
< 4; i
++ )
1077 brw_MUL( p
, wtmp
[ i
], wtmp
[ i
], brw_imm_uw( 0x28D9 ) );
1078 for( i
= 0; i
< 4; i
++ )
1079 brw_XOR( p
, even_bytes( wtmp
[ i
] ), even_bytes( wtmp
[ i
] ),
1080 odd_bytes( wtmp
[ i
] ) );
1081 for( i
= 0; i
< 4; i
++ )
1082 brw_MUL( p
, wtmp
[ i
], wtmp
[ i
], brw_imm_uw( 0xC6D5 ) );
1083 for( i
= 0; i
< 4; i
++ )
1084 brw_XOR( p
, even_bytes( wtmp
[ i
] ), even_bytes( wtmp
[ i
] ),
1085 odd_bytes( wtmp
[ i
] ) );
1086 brw_pop_insn_state( p
);
1088 /* Now we want to initialise the four rear gradients based on the
1089 hashes. Format conversion from signed integer to float leaves
1090 everything scaled too high by a factor of pow( 2, 15 ), but
1091 we correct for that right at the end. */
1093 brw_ADD( p
, t
, param0
, brw_imm_f( -1.0 ) );
1094 brw_MOV( p
, x0y0
, low_words( tmp
[ 0 ] ) );
1095 brw_MOV( p
, x0y1
, low_words( tmp
[ 1 ] ) );
1096 brw_MOV( p
, x1y0
, high_words( tmp
[ 0 ] ) );
1097 brw_MOV( p
, x1y1
, high_words( tmp
[ 1 ] ) );
1099 brw_push_insn_state( p
);
1100 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1101 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 5 ) );
1102 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 5 ) );
1103 brw_pop_insn_state( p
);
1105 brw_MUL( p
, x1y0
, x1y0
, t
);
1106 brw_MUL( p
, x1y1
, x1y1
, t
);
1107 brw_ADD( p
, t
, param1
, brw_imm_f( -1.0 ) );
1108 brw_MUL( p
, x0y0
, x0y0
, param0
);
1109 brw_MUL( p
, x0y1
, x0y1
, param0
);
1112 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
1113 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
1114 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
1115 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
1117 brw_push_insn_state( p
);
1118 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1119 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 5 ) );
1120 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 5 ) );
1121 brw_pop_insn_state( p
);
1123 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1124 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1125 brw_ADD( p
, t
, param0
, brw_imm_f( -1.0 ) );
1126 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param1
);
1127 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param1
);
1129 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1130 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1131 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1132 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1135 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
1136 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
1137 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
1138 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
1140 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param2
);
1141 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], param2
);
1142 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param2
);
1143 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], param2
);
1145 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1146 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1147 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1148 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1150 /* We interpolate between the gradients using the polynomial
1151 6t^5 - 15t^4 + 10t^3 (Perlin). */
1152 brw_MUL( p
, xi
, param0
, brw_imm_f( 6.0 ) );
1153 brw_MUL( p
, yi
, param1
, brw_imm_f( 6.0 ) );
1154 brw_MUL( p
, zi
, param2
, brw_imm_f( 6.0 ) );
1155 brw_ADD( p
, xi
, xi
, brw_imm_f( -15.0 ) );
1156 brw_ADD( p
, yi
, yi
, brw_imm_f( -15.0 ) );
1157 brw_ADD( p
, zi
, zi
, brw_imm_f( -15.0 ) );
1158 brw_MUL( p
, xi
, xi
, param0
);
1159 brw_MUL( p
, yi
, yi
, param1
);
1160 brw_MUL( p
, zi
, zi
, param2
);
1161 brw_ADD( p
, xi
, xi
, brw_imm_f( 10.0 ) );
1162 brw_ADD( p
, yi
, yi
, brw_imm_f( 10.0 ) );
1163 brw_ADD( p
, zi
, zi
, brw_imm_f( 10.0 ) );
1164 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) ); /* unrelated work */
1165 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) ); /* unrelated work */
1166 brw_MUL( p
, xi
, xi
, param0
);
1167 brw_MUL( p
, yi
, yi
, param1
);
1168 brw_MUL( p
, zi
, zi
, param2
);
1169 brw_MUL( p
, xi
, xi
, param0
);
1170 brw_MUL( p
, yi
, yi
, param1
);
1171 brw_MUL( p
, zi
, zi
, param2
);
1172 brw_MUL( p
, xi
, xi
, param0
);
1173 brw_MUL( p
, yi
, yi
, param1
);
1174 brw_MUL( p
, zi
, zi
, param2
);
1176 /* Here we interpolate in the y dimension... */
1177 brw_MUL( p
, x0y1
, x0y1
, yi
);
1178 brw_MUL( p
, x1y1
, x1y1
, yi
);
1179 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
1180 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
1182 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1183 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
1184 brw_MUL( p
, x1y0
, x1y0
, xi
);
1185 brw_ADD( p
, tmp
[ 0 ], x0y0
, x1y0
);
1187 /* Now do the same thing for the front four gradients... */
1189 brw_MOV( p
, x0y0
, low_words( tmp
[ 2 ] ) );
1190 brw_MOV( p
, x0y1
, low_words( tmp
[ 3 ] ) );
1191 brw_MOV( p
, x1y0
, high_words( tmp
[ 2 ] ) );
1192 brw_MOV( p
, x1y1
, high_words( tmp
[ 3 ] ) );
1194 brw_push_insn_state( p
);
1195 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1196 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 5 ) );
1197 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 5 ) );
1198 brw_pop_insn_state( p
);
1200 brw_MUL( p
, x1y0
, x1y0
, t
);
1201 brw_MUL( p
, x1y1
, x1y1
, t
);
1202 brw_ADD( p
, t
, param1
, brw_imm_f( -1.0 ) );
1203 brw_MUL( p
, x0y0
, x0y0
, param0
);
1204 brw_MUL( p
, x0y1
, x0y1
, param0
);
1207 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
1208 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
1209 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
1210 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
1212 brw_push_insn_state( p
);
1213 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1214 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 5 ) );
1215 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 5 ) );
1216 brw_pop_insn_state( p
);
1218 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1219 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1220 brw_ADD( p
, t
, param2
, brw_imm_f( -1.0 ) );
1221 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param1
);
1222 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param1
);
1224 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1225 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1226 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1227 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1230 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
1231 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
1232 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
1233 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
1235 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], t
);
1236 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1237 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], t
);
1238 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1240 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1241 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1242 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1243 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1245 /* The interpolation coefficients are still around from last time, so
1246 again interpolate in the y dimension... */
1247 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) );
1248 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) );
1249 brw_MUL( p
, x0y1
, x0y1
, yi
);
1250 brw_MUL( p
, x1y1
, x1y1
, yi
);
1251 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
1252 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
1254 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1255 time put the front face in tmp[ 1 ] and we're nearly there... */
1256 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
1257 brw_MUL( p
, x1y0
, x1y0
, xi
);
1258 brw_ADD( p
, tmp
[ 1 ], x0y0
, x1y0
);
1260 /* The final interpolation, in the z dimension: */
1261 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], negate( tmp
[ 0 ] ) );
1262 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], zi
);
1263 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], tmp
[ 1 ] );
1265 /* scale by pow( 2, -15 ), as described above */
1266 brw_MUL( p
, param0
, tmp
[ 0 ], brw_imm_f( 0.000030517578125 ) );
1268 release_tmps( c
, mark
);
/**
 * Emit code for the GLSL noise3() built-in.
 *
 * The three source components are copied into freshly allocated
 * temporaries, the shared SUB_NOISE3 subroutine is invoked (emitting
 * noise3_sub's body on first use), and the scalar result -- which the
 * subroutine leaves in param0 -- is broadcast to every channel enabled
 * in the instruction's writemask, honouring saturation.
 */
static void emit_noise3( struct brw_wm_compile *c,
			 const struct prog_instruction *inst )
{
   struct brw_compile *p = &c->func;
   struct brw_reg src0, src1, src2, param0, param1, param2, dst;
   GLuint mask = inst->DstReg.WriteMask;
   int i;
   int mark = mark_tmps( c );

   /* NOTE(review): the subroutine locates its parameters relative to the
      temp-allocator mark, so noise must be emitted with no temporaries
      outstanding -- confirm against noise3_sub's lookup_tmp() offsets. */
   assert( mark == 0 );

   src0 = get_src_reg( c, inst, 0, 0 );
   src1 = get_src_reg( c, inst, 0, 1 );
   src2 = get_src_reg( c, inst, 0, 2 );

   /* Copy the sources into temporaries: the subroutine both reads and
      clobbers its parameter registers. */
   param0 = alloc_tmp( c );
   param1 = alloc_tmp( c );
   param2 = alloc_tmp( c );

   brw_MOV( p, param0, src0 );
   brw_MOV( p, param1, src1 );
   brw_MOV( p, param2, src2 );

   invoke_subroutine( c, SUB_NOISE3, noise3_sub );

   /* Fill in the result: */
   brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
   for (i = 0 ; i < 4; i++) {
      if (mask & (1<<i)) {
         dst = get_dst_reg(c, inst, i);
         brw_MOV( p, dst, param0 );
      }
   }
   if( inst->SaturateMode == SATURATE_ZERO_ONE )
      brw_set_saturate( p, 0 );

   release_tmps( c, mark );
}
/*
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes.  Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 */
static void noise4_sub( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg param[ 4 ],
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      w0, /* noise for the w=0 cube */
      floors[ 2 ], /* integer coordinates of base corner of hypercube */
      interp[ 4 ], /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i, j;
   int mark = mark_tmps( c );
   GLuint loop, origin;

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   w0 = alloc_tmp( c );
   floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
   floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

   /* The four noise parameters were allocated by the caller immediately
      before this subroutine was invoked.  NOTE(review): the mark - 5
      offset presumably accounts for one extra slot consumed by the
      subroutine mechanism -- confirm against invoke_subroutine(). */
   for( i = 0; i < 4; i++ ) {
      param[ i ] = lookup_tmp( c, mark - 5 + i );
      interp[ i ] = alloc_tmp( c );
   }

   /* tmp/itmp/wtmp are three views of the same eight registers: float,
      32-bit unsigned, and 16-way unsigned word respectively. */
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* We only want 16 bits of precision from the integral part of each
      co-ordinate, but unfortunately the RNDD semantics would saturate
      at 16 bits if we performed the operation directly to a 16-bit
      destination.  Therefore, we round to 32-bit temporaries where
      appropriate, and then store only the lower 16 bits. */
   brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
   brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
   brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
   brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

   /* Modify the flag register here, because the side effect is useful
      later (see below).  We know for certain that all flags will be
      cleared, since the FRC instruction cannot possibly generate
      negative results.  Even for exceptional inputs (infinities, denormals,
      NaNs), the architecture guarantees that the L conditional is false. */
   brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
   brw_FRC( p, param[ 0 ], param[ 0 ] );
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   for( i = 1; i < 4; i++ )
      brw_FRC( p, param[ i ], param[ i ] );

   /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
      of all, since they do not depend on which cube we are in. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
   for( j = 0; j < 3; j++ )
      for( i = 0; i < 4; i++ )
         brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

   /* Mark the current address, as it will be a jump destination.  The
      following code will be executed twice: first, with the flag
      register clear indicating the w=0 case, and second with flags
      set for w=1. */
   loop = p->nr_insn;

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Since we have only 16 bits of precision in the hash, we
      must be careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
            brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
            brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
            brw_imm_uw( 0x9B93 ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
            brw_imm_uw( 0xA359 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
            brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time).  The predicate here is driven by
      the flag state established before the loop (w=0: clear) or by the
      explicit flag set at the bottom of the loop (w=1: set). */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Here we interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Interpolate in the y dimension: */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* Another interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* Exit the loop if we've computed both cubes...  The branch offset is
      unknown at this point, so emit a placeholder jump and patch it below
      once p->nr_insn has advanced past the loop tail. */
   origin = p->nr_insn;
   brw_push_insn_state( p );
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
   brw_pop_insn_state( p );

   /* Save the result for the w=0 case, and increment the w coordinate: */
   brw_MOV( p, w0, tmp[ 0 ] );
   brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
            brw_imm_uw( 1 ) );

   /* Loop around for the other cube.  Explicitly set the flag register
      (unfortunately we must spend an extra instruction to do this: we
      can't rely on a side effect of the previous MOV or ADD because
      conditional modifiers which are normally true might be false in
      exceptional circumstances, e.g. given a NaN input; the add to
      brw_ip_reg() is not suitable because the IP is not an 8-vector). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
            brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
   brw_pop_insn_state( p );

   /* Patch the previous conditional branch now that we know the
      destination address. */
   brw_set_src1( p->store + origin,
                 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

   /* The very last interpolation. */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
/**
 * Emit code for the GLSL noise4() built-in.
 *
 * Mirrors emit_noise3: the four source components are copied into
 * temporaries, SUB_NOISE4 (noise4_sub) is invoked, and the scalar result
 * -- left in param0 by the subroutine -- is broadcast to every channel
 * enabled in the writemask, honouring saturation.
 */
static void emit_noise4( struct brw_wm_compile *c,
			 const struct prog_instruction *inst )
{
   struct brw_compile *p = &c->func;
   struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
   GLuint mask = inst->DstReg.WriteMask;
   int i;
   int mark = mark_tmps( c );

   /* NOTE(review): noise4_sub locates its parameters relative to the
      temp-allocator mark, so no temporaries may be outstanding here. */
   assert( mark == 0 );

   src0 = get_src_reg( c, inst, 0, 0 );
   src1 = get_src_reg( c, inst, 0, 1 );
   src2 = get_src_reg( c, inst, 0, 2 );
   src3 = get_src_reg( c, inst, 0, 3 );

   /* Copy the sources into temporaries: the subroutine both reads and
      clobbers its parameter registers. */
   param0 = alloc_tmp( c );
   param1 = alloc_tmp( c );
   param2 = alloc_tmp( c );
   param3 = alloc_tmp( c );

   brw_MOV( p, param0, src0 );
   brw_MOV( p, param1, src1 );
   brw_MOV( p, param2, src2 );
   brw_MOV( p, param3, src3 );

   invoke_subroutine( c, SUB_NOISE4, noise4_sub );

   /* Fill in the result: */
   brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
   for (i = 0 ; i < 4; i++) {
      if (mask & (1<<i)) {
         dst = get_dst_reg(c, inst, i);
         brw_MOV( p, dst, param0 );
      }
   }
   if( inst->SaturateMode == SATURATE_ZERO_ONE )
      brw_set_saturate( p, 0 );

   release_tmps( c, mark );
}
/**
 * Resolve subroutine calls after code emit is done.
 *
 * Call sites recorded via brw_save_call() during emission are patched
 * here, once every subroutine's final instruction address is known.
 */
static void post_wm_emit( struct brw_wm_compile *c )
{
   brw_resolve_cals(&c->func);
}
/**
 * Fetch the source registers for operand \p index of \p inst into \p regs,
 * one entry per enabled channel in \p mask.
 *
 * If a fetched source register aliases one of the instruction's
 * destination registers in \p dst (byte-wise identical brw_reg), the
 * source value is copied into a fresh temporary first, so that writing
 * the destination cannot corrupt a not-yet-read source.  The caller is
 * responsible for releasing any temporaries allocated here (via the
 * surrounding mark_tmps/release_tmps pair).
 */
static void
get_argument_regs(struct brw_wm_compile *c,
                  const struct prog_instruction *inst,
                  int index,
                  struct brw_reg *dst,
                  struct brw_reg *regs,
                  int mask)
{
   struct brw_compile *p = &c->func;
   int i, j;

   for (i = 0; i < 4; i++) {
      if (mask & (1 << i)) {
         regs[i] = get_src_reg(c, inst, index, i);

         /* Unalias destination registers from our sources. */
         if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
            for (j = 0; j < 4; j++) {
               if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
                  /* Found an overlap: snapshot the source into a temp and
                     use the temp as the operand instead. */
                  struct brw_reg tmp = alloc_tmp(c);

                  brw_MOV(p, tmp, regs[i]);
                  regs[i] = tmp;
                  break;
               }
            }
         }
      }
   }
}
1774 static void brw_wm_emit_glsl(struct brw_context
*brw
, struct brw_wm_compile
*c
)
1776 struct intel_context
*intel
= &brw
->intel
;
1777 #define MAX_IF_DEPTH 32
1778 #define MAX_LOOP_DEPTH 32
1779 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
];
1780 GLuint i
, if_depth
= 0, loop_depth
= 0;
1781 struct brw_compile
*p
= &c
->func
;
1782 struct brw_indirect stack_index
= brw_indirect(0, 0);
1784 c
->out_of_regs
= GL_FALSE
;
1787 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1788 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1790 for (i
= 0; i
< c
->nr_fp_insns
; i
++) {
1791 const struct prog_instruction
*inst
= &c
->prog_instructions
[i
];
1793 struct brw_reg args
[3][4], dst
[4];
1795 int mark
= mark_tmps( c
);
1800 printf("Inst %d: ", i
);
1801 _mesa_print_instruction(inst
);
1804 /* fetch any constants that this instruction needs */
1805 if (c
->fp
->use_const_buffer
)
1806 fetch_constants(c
, inst
);
1808 if (inst
->Opcode
!= OPCODE_ARL
) {
1809 for (j
= 0; j
< 4; j
++) {
1810 if (inst
->DstReg
.WriteMask
& (1 << j
))
1811 dst
[j
] = get_dst_reg(c
, inst
, j
);
1813 dst
[j
] = brw_null_reg();
1816 for (j
= 0; j
< brw_wm_nr_args(inst
->Opcode
); j
++)
1817 get_argument_regs(c
, inst
, j
, dst
, args
[j
], WRITEMASK_XYZW
);
1819 dst_flags
= inst
->DstReg
.WriteMask
;
1820 if (inst
->SaturateMode
== SATURATE_ZERO_ONE
)
1821 dst_flags
|= SATURATE
;
1823 if (inst
->CondUpdate
)
1824 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NZ
);
1826 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
1828 switch (inst
->Opcode
) {
1830 emit_pixel_xy(c
, dst
, dst_flags
);
1833 emit_delta_xy(p
, dst
, dst_flags
, args
[0]);
1836 emit_pixel_w(c
, dst
, dst_flags
, args
[0], args
[1]);
1839 emit_linterp(p
, dst
, dst_flags
, args
[0], args
[1]);
1842 emit_pinterp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1845 emit_cinterp(p
, dst
, dst_flags
, args
[0]);
1848 emit_wpos_xy(c
, dst
, dst_flags
, args
[0]);
1851 emit_fb_write(c
, args
[0], args
[1], args
[2],
1852 INST_AUX_GET_TARGET(inst
->Aux
),
1853 inst
->Aux
& INST_AUX_EOT
);
1855 case WM_FRONTFACING
:
1856 emit_frontfacing(p
, dst
, dst_flags
);
1859 emit_alu2(p
, brw_ADD
, dst
, dst_flags
, args
[0], args
[1]);
1865 emit_alu1(p
, brw_FRC
, dst
, dst_flags
, args
[0]);
1868 emit_alu1(p
, brw_RNDD
, dst
, dst_flags
, args
[0]);
1871 emit_lrp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1874 emit_alu1(p
, brw_RNDZ
, dst
, dst_flags
, args
[0]);
1878 emit_alu1(p
, brw_MOV
, dst
, dst_flags
, args
[0]);
1881 emit_dp3(p
, dst
, dst_flags
, args
[0], args
[1]);
1884 emit_dp4(p
, dst
, dst_flags
, args
[0], args
[1]);
1887 emit_xpd(p
, dst
, dst_flags
, args
[0], args
[1]);
1890 emit_dph(p
, dst
, dst_flags
, args
[0], args
[1]);
1893 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, dst_flags
, args
[0]);
1896 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, dst_flags
, args
[0]);
1899 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, dst_flags
, args
[0]);
1902 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, dst_flags
, args
[0]);
1905 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, dst_flags
, args
[0]);
1908 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, dst_flags
, args
[0]);
1911 emit_cmp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1914 emit_min(p
, dst
, dst_flags
, args
[0], args
[1]);
1917 emit_max(p
, dst
, dst_flags
, args
[0], args
[1]);
1921 emit_ddxy(p
, dst
, dst_flags
, (inst
->Opcode
== OPCODE_DDX
),
1925 emit_sop(p
, dst
, dst_flags
,
1926 BRW_CONDITIONAL_L
, args
[0], args
[1]);
1929 emit_sop(p
, dst
, dst_flags
,
1930 BRW_CONDITIONAL_LE
, args
[0], args
[1]);
1933 emit_sop(p
, dst
, dst_flags
,
1934 BRW_CONDITIONAL_G
, args
[0], args
[1]);
1937 emit_sop(p
, dst
, dst_flags
,
1938 BRW_CONDITIONAL_GE
, args
[0], args
[1]);
1941 emit_sop(p
, dst
, dst_flags
,
1942 BRW_CONDITIONAL_EQ
, args
[0], args
[1]);
1945 emit_sop(p
, dst
, dst_flags
,
1946 BRW_CONDITIONAL_NEQ
, args
[0], args
[1]);
1949 emit_alu2(p
, brw_MUL
, dst
, dst_flags
, args
[0], args
[1]);
1952 emit_math2(c
, BRW_MATH_FUNCTION_POW
,
1953 dst
, dst_flags
, args
[0], args
[1]);
1956 emit_mad(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1959 emit_noise1(c
, inst
);
1962 emit_noise2(c
, inst
);
1965 emit_noise3(c
, inst
);
1968 emit_noise4(c
, inst
);
1971 emit_tex(c
, dst
, dst_flags
, args
[0],
1972 get_reg(c
, PROGRAM_PAYLOAD
, PAYLOAD_DEPTH
,
1976 (c
->key
.shadowtex_mask
& (1 << inst
->TexSrcUnit
)) != 0);
1979 emit_txb(c
, dst
, dst_flags
, args
[0],
1980 get_reg(c
, PROGRAM_PAYLOAD
, PAYLOAD_DEPTH
,
1983 c
->fp
->program
.Base
.SamplerUnits
[inst
->TexSrcUnit
]);
1989 assert(if_depth
< MAX_IF_DEPTH
);
1990 if_inst
[if_depth
++] = brw_IF(p
, BRW_EXECUTE_8
);
1993 assert(if_depth
> 0);
1994 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1997 assert(if_depth
> 0);
1998 brw_ENDIF(p
, if_inst
[--if_depth
]);
2001 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
2007 brw_push_insn_state(p
);
2008 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
2009 brw_set_access_mode(p
, BRW_ALIGN_1
);
2010 brw_ADD(p
, deref_1ud(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
2011 brw_set_access_mode(p
, BRW_ALIGN_16
);
2012 brw_ADD(p
, get_addr_reg(stack_index
),
2013 get_addr_reg(stack_index
), brw_imm_d(4));
2014 brw_save_call(&c
->func
, inst
->Comment
, p
->nr_insn
);
2015 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2016 brw_pop_insn_state(p
);
2020 brw_push_insn_state(p
);
2021 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
2022 brw_ADD(p
, get_addr_reg(stack_index
),
2023 get_addr_reg(stack_index
), brw_imm_d(-4));
2024 brw_set_access_mode(p
, BRW_ALIGN_1
);
2025 brw_MOV(p
, brw_ip_reg(), deref_1ud(stack_index
, 0));
2026 brw_set_access_mode(p
, BRW_ALIGN_16
);
2027 brw_pop_insn_state(p
);
2030 case OPCODE_BGNLOOP
:
2031 /* XXX may need to invalidate the current_constant regs */
2032 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
2036 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2040 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2042 case OPCODE_ENDLOOP
:
2044 struct brw_instruction
*inst0
, *inst1
;
2047 if (intel
->is_ironlake
)
2050 assert(loop_depth
> 0);
2052 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
2053 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2054 while (inst0
> loop_inst
[loop_depth
]) {
2056 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
2057 inst0
->bits3
.if_else
.jump_count
== 0) {
2058 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
2059 inst0
->bits3
.if_else
.pop_count
= 0;
2061 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
2062 inst0
->bits3
.if_else
.jump_count
== 0) {
2063 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
2064 inst0
->bits3
.if_else
.pop_count
= 0;
2070 printf("unsupported opcode %d (%s) in fragment shader\n",
2071 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
2072 _mesa_opcode_string(inst
->Opcode
) : "unknown");
2075 /* Release temporaries containing any unaliased source regs. */
2076 release_tmps( c
, mark
);
2078 if (inst
->CondUpdate
)
2079 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
2081 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2085 if (INTEL_DEBUG
& DEBUG_WM
) {
2086 printf("wm-native:\n");
2087 for (i
= 0; i
< p
->nr_insn
; i
++)
2088 brw_disasm(stderr
, &p
->store
[i
]);
2094 * Do GPU code generation for shaders that use GLSL features such as
2095 * flow control. Other shaders will be compiled with the
2097 void brw_wm_glsl_emit(struct brw_context
*brw
, struct brw_wm_compile
*c
)
2099 if (INTEL_DEBUG
& DEBUG_WM
) {
2100 printf("brw_wm_glsl_emit:\n");
2103 /* initial instruction translation/simplification */
2106 /* actual code generation */
2107 brw_wm_emit_glsl(brw
, c
);
2109 if (INTEL_DEBUG
& DEBUG_WM
) {
2110 brw_wm_print_program(c
, "brw_wm_glsl_emit done");
2113 c
->prog_data
.total_grf
= num_grf_used(c
);
2114 c
->prog_data
.total_scratch
= 0;