1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
10 SUB_NOISE1
, SUB_NOISE2
, SUB_NOISE3
, SUB_NOISE4
13 static struct brw_reg
get_dst_reg(struct brw_wm_compile
*c
,
14 const struct prog_instruction
*inst
,
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
22 GLboolean
brw_wm_is_glsl(const struct gl_fragment_program
*fp
)
26 for (i
= 0; i
< fp
->Base
.NumInstructions
; i
++) {
27 const struct prog_instruction
*inst
= &fp
->Base
.Instructions
[i
];
28 switch (inst
->Opcode
) {
51 reclaim_temps(struct brw_wm_compile
*c
);
54 /** Mark GRF register as used. */
56 prealloc_grf(struct brw_wm_compile
*c
, int r
)
58 c
->used_grf
[r
] = GL_TRUE
;
62 /** Mark given GRF register as not in use. */
64 release_grf(struct brw_wm_compile
*c
, int r
)
66 /*assert(c->used_grf[r]);*/
67 c
->used_grf
[r
] = GL_FALSE
;
68 c
->first_free_grf
= MIN2(c
->first_free_grf
, r
);
72 /** Return index of a free GRF, mark it as used. */
74 alloc_grf(struct brw_wm_compile
*c
)
77 for (r
= c
->first_free_grf
; r
< BRW_WM_MAX_GRF
; r
++) {
78 if (!c
->used_grf
[r
]) {
79 c
->used_grf
[r
] = GL_TRUE
;
80 c
->first_free_grf
= r
+ 1; /* a guess */
85 /* no free temps, try to reclaim some */
87 c
->first_free_grf
= 0;
90 for (r
= c
->first_free_grf
; r
< BRW_WM_MAX_GRF
; r
++) {
91 if (!c
->used_grf
[r
]) {
92 c
->used_grf
[r
] = GL_TRUE
;
93 c
->first_free_grf
= r
+ 1; /* a guess */
98 for (r
= 0; r
< BRW_WM_MAX_GRF
; r
++) {
99 assert(c
->used_grf
[r
]);
102 /* really, no free GRF regs found */
103 if (!c
->out_of_regs
) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL
, "i965: ran out of registers for fragment program");
106 c
->out_of_regs
= GL_TRUE
;
113 /** Return number of GRF registers used */
115 num_grf_used(const struct brw_wm_compile
*c
)
118 for (r
= BRW_WM_MAX_GRF
- 1; r
>= 0; r
--)
127 * Record the mapping of a Mesa register to a hardware register.
129 static void set_reg(struct brw_wm_compile
*c
, int file
, int index
,
130 int component
, struct brw_reg reg
)
132 c
->wm_regs
[file
][index
][component
].reg
= reg
;
133 c
->wm_regs
[file
][index
][component
].inited
= GL_TRUE
;
136 static struct brw_reg
alloc_tmp(struct brw_wm_compile
*c
)
140 /* if we need to allocate another temp, grow the tmp_regs[] array */
141 if (c
->tmp_index
== c
->tmp_max
) {
142 int r
= alloc_grf(c
);
144 /*printf("Out of temps in %s\n", __FUNCTION__);*/
145 r
= 50; /* XXX random register! */
147 c
->tmp_regs
[ c
->tmp_max
++ ] = r
;
150 /* form the GRF register */
151 reg
= brw_vec8_grf(c
->tmp_regs
[ c
->tmp_index
++ ], 0);
152 /*printf("alloc_temp %d\n", reg.nr);*/
153 assert(reg
.nr
< BRW_WM_MAX_GRF
);
159 * Save current temp register info.
160 * There must be a matching call to release_tmps().
162 static int mark_tmps(struct brw_wm_compile
*c
)
167 static struct brw_reg
lookup_tmp( struct brw_wm_compile
*c
, int index
)
169 return brw_vec8_grf( c
->tmp_regs
[ index
], 0 );
172 static void release_tmps(struct brw_wm_compile
*c
, int mark
)
178 * Convert Mesa src register to brw register.
180 * Since we're running in SOA mode each Mesa register corresponds to four
181 * hardware registers. We allocate the hardware registers as needed here.
183 * \param file register file, one of PROGRAM_x
184 * \param index register number
185 * \param component src component (X=0, Y=1, Z=2, W=3)
186 * \param nr not used?!?
187 * \param neg negate value?
188 * \param abs take absolute value?
190 static struct brw_reg
191 get_reg(struct brw_wm_compile
*c
, int file
, int index
, int component
,
192 int nr
, GLuint neg
, GLuint abs
)
196 case PROGRAM_STATE_VAR
:
197 case PROGRAM_CONSTANT
:
198 case PROGRAM_UNIFORM
:
199 file
= PROGRAM_STATE_VAR
;
201 case PROGRAM_UNDEFINED
:
202 return brw_null_reg();
203 case PROGRAM_TEMPORARY
:
206 case PROGRAM_PAYLOAD
:
209 _mesa_problem(NULL
, "Unexpected file in get_reg()");
210 return brw_null_reg();
214 assert(component
< 4);
216 /* see if we've already allocated a HW register for this Mesa register */
217 if (c
->wm_regs
[file
][index
][component
].inited
) {
219 reg
= c
->wm_regs
[file
][index
][component
].reg
;
222 /* no, allocate new register */
223 int grf
= alloc_grf(c
);
224 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
226 /* totally out of temps */
227 grf
= 51; /* XXX random register! */
230 reg
= brw_vec8_grf(grf
, 0);
231 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
233 set_reg(c
, file
, index
, component
, reg
);
236 if (neg
& (1 << component
)) {
247 * This is called if we run out of GRF registers. Examine the live intervals
248 * of temp regs in the program and free those which won't be used again.
251 reclaim_temps(struct brw_wm_compile
*c
)
253 GLint intBegin
[MAX_PROGRAM_TEMPS
];
254 GLint intEnd
[MAX_PROGRAM_TEMPS
];
257 /*printf("Reclaim temps:\n");*/
259 _mesa_find_temp_intervals(c
->prog_instructions
, c
->nr_fp_insns
,
262 for (index
= 0; index
< MAX_PROGRAM_TEMPS
; index
++) {
263 if (intEnd
[index
] != -1 && intEnd
[index
] < c
->cur_inst
) {
264 /* program temp[i] can be freed */
266 /*printf(" temp[%d] is dead\n", index);*/
267 for (component
= 0; component
< 4; component
++) {
268 if (c
->wm_regs
[PROGRAM_TEMPORARY
][index
][component
].inited
) {
269 int r
= c
->wm_regs
[PROGRAM_TEMPORARY
][index
][component
].reg
.nr
;
272 printf(" Reclaim temp %d, reg %d at inst %d\n",
273 index, r, c->cur_inst);
275 c
->wm_regs
[PROGRAM_TEMPORARY
][index
][component
].inited
= GL_FALSE
;
286 * Preallocate registers. This sets up the Mesa to hardware register
287 * mapping for certain registers, such as constants (uniforms/state vars)
290 static void prealloc_reg(struct brw_wm_compile
*c
)
294 int urb_read_length
= 0;
295 GLuint inputs
= FRAG_BIT_WPOS
| c
->fp_interp_emitted
;
296 GLuint reg_index
= 0;
298 memset(c
->used_grf
, GL_FALSE
, sizeof(c
->used_grf
));
299 c
->first_free_grf
= 0;
301 for (i
= 0; i
< 4; i
++) {
302 if (i
< c
->key
.nr_depth_regs
)
303 reg
= brw_vec8_grf(i
* 2, 0);
305 reg
= brw_vec8_grf(0, 0);
306 set_reg(c
, PROGRAM_PAYLOAD
, PAYLOAD_DEPTH
, i
, reg
);
308 reg_index
+= 2 * c
->key
.nr_depth_regs
;
312 const GLuint nr_params
= c
->fp
->program
.Base
.Parameters
->NumParameters
;
313 const GLuint nr_temps
= c
->fp
->program
.Base
.NumTemporaries
;
315 /* use a real constant buffer, or just use a section of the GRF? */
316 /* XXX this heuristic may need adjustment... */
317 if ((nr_params
+ nr_temps
) * 4 + reg_index
> 80)
318 c
->fp
->use_const_buffer
= GL_TRUE
;
320 c
->fp
->use_const_buffer
= GL_FALSE
;
321 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
323 if (c
->fp
->use_const_buffer
) {
324 /* We'll use a real constant buffer and fetch constants from
325 * it with a dataport read message.
328 /* number of float constants in CURBE */
329 c
->prog_data
.nr_params
= 0;
332 const struct gl_program_parameter_list
*plist
=
333 c
->fp
->program
.Base
.Parameters
;
336 /* number of float constants in CURBE */
337 c
->prog_data
.nr_params
= 4 * nr_params
;
339 /* loop over program constants (float[4]) */
340 for (i
= 0; i
< nr_params
; i
++) {
341 /* loop over XYZW channels */
342 for (j
= 0; j
< 4; j
++, index
++) {
343 reg
= brw_vec1_grf(reg_index
+ index
/ 8, index
% 8);
344 /* Save pointer to parameter/constant value.
345 * Constants will be copied in prepare_constant_buffer()
347 c
->prog_data
.param
[index
] = &plist
->ParameterValues
[i
][j
];
348 set_reg(c
, PROGRAM_STATE_VAR
, i
, j
, reg
);
351 /* number of constant regs used (each reg is float[8]) */
352 c
->nr_creg
= 2 * ((4 * nr_params
+ 15) / 16);
353 reg_index
+= c
->nr_creg
;
357 /* fragment shader inputs */
358 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
361 if (i
>= VERT_RESULT_VAR0
)
362 fp_input
= i
- VERT_RESULT_VAR0
+ FRAG_ATTRIB_VAR0
;
363 else if (i
<= VERT_RESULT_TEX7
)
368 if (fp_input
>= 0 && inputs
& (1 << fp_input
)) {
369 urb_read_length
= reg_index
;
370 reg
= brw_vec8_grf(reg_index
, 0);
371 for (j
= 0; j
< 4; j
++)
372 set_reg(c
, PROGRAM_PAYLOAD
, fp_input
, j
, reg
);
374 if (c
->key
.vp_outputs_written
& (1 << i
)) {
379 c
->prog_data
.first_curbe_grf
= c
->key
.nr_depth_regs
* 2;
380 c
->prog_data
.urb_read_length
= urb_read_length
;
381 c
->prog_data
.curb_read_length
= c
->nr_creg
;
382 c
->emit_mask_reg
= brw_uw1_reg(BRW_GENERAL_REGISTER_FILE
, reg_index
, 0);
384 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg_index
, 0);
387 /* mark GRF regs [0..reg_index-1] as in-use */
388 for (i
= 0; i
< reg_index
; i
++)
391 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
392 prealloc_grf(c
, 126);
393 prealloc_grf(c
, 127);
395 for (i
= 0; i
< c
->nr_fp_insns
; i
++) {
396 const struct prog_instruction
*inst
= &c
->prog_instructions
[i
];
397 struct brw_reg dst
[4];
399 switch (inst
->Opcode
) {
402 /* Allocate the channels of texture results contiguously,
403 * since they are written out that way by the sampler unit.
405 for (j
= 0; j
< 4; j
++) {
406 dst
[j
] = get_dst_reg(c
, inst
, j
);
408 assert(dst
[j
].nr
== dst
[j
- 1].nr
+ 1);
416 /* An instruction may reference up to three constants.
417 * They'll be found in these registers.
418 * XXX alloc these on demand!
420 if (c
->fp
->use_const_buffer
) {
421 for (i
= 0; i
< 3; i
++) {
422 c
->current_const
[i
].index
= -1;
423 c
->current_const
[i
].reg
= brw_vec8_grf(alloc_grf(c
), 0);
427 printf("USE CONST BUFFER? %d\n", c
->fp
->use_const_buffer
);
428 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index
);
434 * Check if any of the instruction's src registers are constants, uniforms,
435 * or statevars. If so, fetch any constants that we don't already have in
436 * the three GRF slots.
438 static void fetch_constants(struct brw_wm_compile
*c
,
439 const struct prog_instruction
*inst
)
441 struct brw_compile
*p
= &c
->func
;
444 /* loop over instruction src regs */
445 for (i
= 0; i
< 3; i
++) {
446 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
447 if (src
->File
== PROGRAM_STATE_VAR
||
448 src
->File
== PROGRAM_CONSTANT
||
449 src
->File
== PROGRAM_UNIFORM
) {
450 c
->current_const
[i
].index
= src
->Index
;
453 printf(" fetch const[%d] for arg %d into reg %d\n",
454 src
->Index
, i
, c
->current_const
[i
].reg
.nr
);
457 /* need to fetch the constant now */
459 c
->current_const
[i
].reg
, /* writeback dest */
460 src
->RelAddr
, /* relative indexing? */
461 16 * src
->Index
, /* byte offset */
462 SURF_INDEX_FRAG_CONST_BUFFER
/* binding table index */
470 * Convert Mesa dst register to brw register.
472 static struct brw_reg
get_dst_reg(struct brw_wm_compile
*c
,
473 const struct prog_instruction
*inst
,
477 return get_reg(c
, inst
->DstReg
.File
, inst
->DstReg
.Index
, component
, nr
,
482 static struct brw_reg
483 get_src_reg_const(struct brw_wm_compile
*c
,
484 const struct prog_instruction
*inst
,
485 GLuint srcRegIndex
, GLuint component
)
487 /* We should have already fetched the constant from the constant
488 * buffer in fetch_constants(). Now we just have to return a
489 * register description that extracts the needed component and
490 * smears it across all eight vector components.
492 const struct prog_src_register
*src
= &inst
->SrcReg
[srcRegIndex
];
493 struct brw_reg const_reg
;
495 assert(component
< 4);
496 assert(srcRegIndex
< 3);
497 assert(c
->current_const
[srcRegIndex
].index
!= -1);
498 const_reg
= c
->current_const
[srcRegIndex
].reg
;
500 /* extract desired float from the const_reg, and smear */
501 const_reg
= stride(const_reg
, 0, 1, 0);
502 const_reg
.subnr
= component
* 4;
504 if (src
->Negate
& (1 << component
))
505 const_reg
= negate(const_reg
);
507 const_reg
= brw_abs(const_reg
);
510 printf(" form const[%d].%d for arg %d, reg %d\n",
511 c
->current_const
[srcRegIndex
].index
,
522 * Convert Mesa src register to brw register.
524 static struct brw_reg
get_src_reg(struct brw_wm_compile
*c
,
525 const struct prog_instruction
*inst
,
526 GLuint srcRegIndex
, GLuint channel
)
528 const struct prog_src_register
*src
= &inst
->SrcReg
[srcRegIndex
];
530 const GLuint component
= GET_SWZ(src
->Swizzle
, channel
);
532 /* Extended swizzle terms */
533 if (component
== SWIZZLE_ZERO
) {
534 return brw_imm_f(0.0F
);
536 else if (component
== SWIZZLE_ONE
) {
537 return brw_imm_f(1.0F
);
540 if (c
->fp
->use_const_buffer
&&
541 (src
->File
== PROGRAM_STATE_VAR
||
542 src
->File
== PROGRAM_CONSTANT
||
543 src
->File
== PROGRAM_UNIFORM
)) {
544 return get_src_reg_const(c
, inst
, srcRegIndex
, component
);
547 /* other type of source register */
548 return get_reg(c
, src
->File
, src
->Index
, component
, nr
,
549 src
->Negate
, src
->Abs
);
555 * Same as \sa get_src_reg() but if the register is a literal, emit
556 * a brw_reg encoding the literal.
557 * Note that a brw instruction only allows one src operand to be a literal.
558 * For instructions with more than one operand, only the second can be a
559 * literal. This means that we treat some literals as constants/uniforms
560 * (which is why PROGRAM_CONSTANT is checked in fetch_constants()).
563 static struct brw_reg
get_src_reg_imm(struct brw_wm_compile
*c
,
564 const struct prog_instruction
*inst
,
565 GLuint srcRegIndex
, GLuint channel
)
567 const struct prog_src_register
*src
= &inst
->SrcReg
[srcRegIndex
];
568 if (src
->File
== PROGRAM_CONSTANT
) {
570 const int component
= GET_SWZ(src
->Swizzle
, channel
);
571 const GLfloat
*param
=
572 c
->fp
->program
.Base
.Parameters
->ParameterValues
[src
->Index
];
573 GLfloat value
= param
[component
];
574 if (src
->Negate
& (1 << channel
))
577 value
= FABSF(value
);
579 printf(" form immed value %f for chan %d\n", value
, channel
);
581 return brw_imm_f(value
);
584 return get_src_reg(c
, inst
, srcRegIndex
, channel
);
590 * Subroutines are minimal support for reusable instruction sequences.
591 * They are implemented as simply as possible to minimise overhead: there
592 * is no explicit support for communication between the caller and callee
593 * other than saving the return address in a temporary register, nor is
594 * there any automatic local storage. This implies that great care is
595 * required before attempting reentrancy or any kind of nested
596 * subroutine invocations.
598 static void invoke_subroutine( struct brw_wm_compile
*c
,
599 enum _subroutine subroutine
,
600 void (*emit
)( struct brw_wm_compile
* ) )
602 struct brw_compile
*p
= &c
->func
;
604 assert( subroutine
< BRW_WM_MAX_SUBROUTINE
);
606 if( c
->subroutines
[ subroutine
] ) {
607 /* subroutine previously emitted: reuse existing instructions */
609 int mark
= mark_tmps( c
);
610 struct brw_reg return_address
= retype( alloc_tmp( c
),
611 BRW_REGISTER_TYPE_UD
);
612 int here
= p
->nr_insn
;
614 brw_push_insn_state(p
);
615 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
616 brw_ADD( p
, return_address
, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
618 brw_ADD( p
, brw_ip_reg(), brw_ip_reg(),
619 brw_imm_d( ( c
->subroutines
[ subroutine
] -
621 brw_pop_insn_state(p
);
623 release_tmps( c
, mark
);
625 /* previously unused subroutine: emit, and mark for later reuse */
627 int mark
= mark_tmps( c
);
628 struct brw_reg return_address
= retype( alloc_tmp( c
),
629 BRW_REGISTER_TYPE_UD
);
630 struct brw_instruction
*calc
;
631 int base
= p
->nr_insn
;
633 brw_push_insn_state(p
);
634 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
635 calc
= brw_ADD( p
, return_address
, brw_ip_reg(), brw_imm_ud( 0 ) );
636 brw_pop_insn_state(p
);
638 c
->subroutines
[ subroutine
] = p
->nr_insn
;
642 brw_push_insn_state(p
);
643 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
644 brw_MOV( p
, brw_ip_reg(), return_address
);
645 brw_pop_insn_state(p
);
647 brw_set_src1( calc
, brw_imm_ud( ( p
->nr_insn
- base
) << 4 ) );
649 release_tmps( c
, mark
);
653 /* Workaround for using brw_wm_emit.c's emit functions, which expect
654 * destination regs to be uniquely written. Moves arguments out to
655 * temporaries as necessary for instructions which use their destination as
659 unalias3(struct brw_wm_compile
*c
,
660 void (*func
)(struct brw_compile
*c
,
661 const struct brw_reg
*dst
,
663 const struct brw_reg
*arg0
,
664 const struct brw_reg
*arg1
,
665 const struct brw_reg
*arg2
),
666 const struct brw_reg
*dst
,
668 const struct brw_reg
*arg0
,
669 const struct brw_reg
*arg1
,
670 const struct brw_reg
*arg2
)
672 struct brw_compile
*p
= &c
->func
;
673 struct brw_reg tmp_arg0
[4], tmp_arg1
[4], tmp_arg2
[4];
675 int mark
= mark_tmps(c
);
677 for (j
= 0; j
< 4; j
++) {
678 tmp_arg0
[j
] = arg0
[j
];
679 tmp_arg1
[j
] = arg1
[j
];
680 tmp_arg2
[j
] = arg2
[j
];
683 for (i
= 0; i
< 4; i
++) {
685 for (j
= 0; j
< 4; j
++) {
686 if (arg0
[j
].file
== dst
[i
].file
&&
687 dst
[i
].nr
== arg0
[j
].nr
) {
688 tmp_arg0
[j
] = alloc_tmp(c
);
689 brw_MOV(p
, tmp_arg0
[j
], arg0
[j
]);
691 if (arg1
[j
].file
== dst
[i
].file
&&
692 dst
[i
].nr
== arg1
[j
].nr
) {
693 tmp_arg1
[j
] = alloc_tmp(c
);
694 brw_MOV(p
, tmp_arg1
[j
], arg1
[j
]);
696 if (arg2
[j
].file
== dst
[i
].file
&&
697 dst
[i
].nr
== arg2
[j
].nr
) {
698 tmp_arg2
[j
] = alloc_tmp(c
);
699 brw_MOV(p
, tmp_arg2
[j
], arg2
[j
]);
705 func(p
, dst
, mask
, tmp_arg0
, tmp_arg1
, tmp_arg2
);
707 release_tmps(c
, mark
);
710 static void emit_pixel_xy(struct brw_wm_compile
*c
,
711 const struct prog_instruction
*inst
)
713 struct brw_reg r1
= brw_vec1_grf(1, 0);
714 struct brw_reg r1_uw
= retype(r1
, BRW_REGISTER_TYPE_UW
);
716 struct brw_reg dst0
, dst1
;
717 struct brw_compile
*p
= &c
->func
;
718 GLuint mask
= inst
->DstReg
.WriteMask
;
720 dst0
= get_dst_reg(c
, inst
, 0);
721 dst1
= get_dst_reg(c
, inst
, 1);
722 /* Calculate pixel centers by adding 1 or 0 to each of the
723 * micro-tile coordinates passed in r1.
725 if (mask
& WRITEMASK_X
) {
727 vec8(retype(dst0
, BRW_REGISTER_TYPE_UW
)),
728 stride(suboffset(r1_uw
, 4), 2, 4, 0),
729 brw_imm_v(0x10101010));
732 if (mask
& WRITEMASK_Y
) {
734 vec8(retype(dst1
, BRW_REGISTER_TYPE_UW
)),
735 stride(suboffset(r1_uw
, 5), 2, 4, 0),
736 brw_imm_v(0x11001100));
740 static void emit_delta_xy(struct brw_wm_compile
*c
,
741 const struct prog_instruction
*inst
)
743 struct brw_reg r1
= brw_vec1_grf(1, 0);
744 struct brw_reg dst0
, dst1
, src0
, src1
;
745 struct brw_compile
*p
= &c
->func
;
746 GLuint mask
= inst
->DstReg
.WriteMask
;
748 dst0
= get_dst_reg(c
, inst
, 0);
749 dst1
= get_dst_reg(c
, inst
, 1);
750 src0
= get_src_reg(c
, inst
, 0, 0);
751 src1
= get_src_reg(c
, inst
, 0, 1);
752 /* Calc delta X,Y by subtracting origin in r1 from the pixel
755 if (mask
& WRITEMASK_X
) {
758 retype(src0
, BRW_REGISTER_TYPE_UW
),
762 if (mask
& WRITEMASK_Y
) {
765 retype(src1
, BRW_REGISTER_TYPE_UW
),
766 negate(suboffset(r1
,1)));
771 static void fire_fb_write( struct brw_wm_compile
*c
,
777 struct brw_compile
*p
= &c
->func
;
778 /* Pass through control information:
780 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
782 brw_push_insn_state(p
);
783 brw_set_mask_control(p
, BRW_MASK_DISABLE
); /* ? */
785 brw_message_reg(base_reg
+ 1),
787 brw_pop_insn_state(p
);
789 /* Send framebuffer write message: */
791 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
),
793 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW
),
800 static void emit_fb_write(struct brw_wm_compile
*c
,
801 const struct prog_instruction
*inst
)
803 struct brw_compile
*p
= &c
->func
;
809 /* Reserve a space for AA - may not be needed:
811 if (c
->key
.aa_dest_stencil_reg
)
814 brw_push_insn_state(p
);
815 for (channel
= 0; channel
< 4; channel
++) {
816 src0
= get_src_reg(c
, inst
, 0, channel
);
817 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
818 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
819 brw_MOV(p
, brw_message_reg(nr
+ channel
), src0
);
821 /* skip over the regs populated above: */
823 brw_pop_insn_state(p
);
825 if (c
->key
.source_depth_to_render_target
) {
826 if (c
->key
.computes_depth
) {
827 src0
= get_src_reg(c
, inst
, 2, 2);
828 brw_MOV(p
, brw_message_reg(nr
), src0
);
831 src0
= get_src_reg(c
, inst
, 1, 1);
832 brw_MOV(p
, brw_message_reg(nr
), src0
);
838 if (c
->key
.dest_depth_reg
) {
839 const GLuint comp
= c
->key
.dest_depth_reg
/ 2;
840 const GLuint off
= c
->key
.dest_depth_reg
% 2;
843 /* XXX this code needs review/testing */
844 struct brw_reg arg1_0
= get_src_reg(c
, inst
, 1, comp
);
845 struct brw_reg arg1_1
= get_src_reg(c
, inst
, 1, comp
+1);
847 brw_push_insn_state(p
);
848 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
850 brw_MOV(p
, brw_message_reg(nr
), offset(arg1_0
, 1));
852 brw_MOV(p
, brw_message_reg(nr
+1), arg1_1
);
853 brw_pop_insn_state(p
);
857 struct brw_reg src
= get_src_reg(c
, inst
, 1, 1);
858 brw_MOV(p
, brw_message_reg(nr
), src
);
863 target
= INST_AUX_GET_TARGET(inst
->Aux
);
864 eot
= inst
->Aux
& INST_AUX_EOT
;
865 fire_fb_write(c
, 0, nr
, target
, eot
);
868 static void emit_pixel_w( struct brw_wm_compile
*c
,
869 const struct prog_instruction
*inst
)
871 struct brw_compile
*p
= &c
->func
;
872 GLuint mask
= inst
->DstReg
.WriteMask
;
873 if (mask
& WRITEMASK_W
) {
874 struct brw_reg dst
, src0
, delta0
, delta1
;
875 struct brw_reg interp3
;
877 dst
= get_dst_reg(c
, inst
, 3);
878 src0
= get_src_reg(c
, inst
, 0, 0);
879 delta0
= get_src_reg(c
, inst
, 1, 0);
880 delta1
= get_src_reg(c
, inst
, 1, 1);
882 interp3
= brw_vec1_grf(src0
.nr
+1, 4);
883 /* Calc 1/w - just linterp wpos[3] optimized by putting the
884 * result straight into a message reg.
886 brw_LINE(p
, brw_null_reg(), interp3
, delta0
);
887 brw_MAC(p
, brw_message_reg(2), suboffset(interp3
, 1), delta1
);
891 BRW_MATH_FUNCTION_INV
,
892 BRW_MATH_SATURATE_NONE
,
894 BRW_MATH_PRECISION_FULL
);
898 static void emit_linterp(struct brw_wm_compile
*c
,
899 const struct prog_instruction
*inst
)
901 struct brw_compile
*p
= &c
->func
;
902 GLuint mask
= inst
->DstReg
.WriteMask
;
903 struct brw_reg interp
[4];
904 struct brw_reg dst
, delta0
, delta1
;
908 src0
= get_src_reg(c
, inst
, 0, 0);
909 delta0
= get_src_reg(c
, inst
, 1, 0);
910 delta1
= get_src_reg(c
, inst
, 1, 1);
913 interp
[0] = brw_vec1_grf(nr
, 0);
914 interp
[1] = brw_vec1_grf(nr
, 4);
915 interp
[2] = brw_vec1_grf(nr
+1, 0);
916 interp
[3] = brw_vec1_grf(nr
+1, 4);
918 for(i
= 0; i
< 4; i
++ ) {
920 dst
= get_dst_reg(c
, inst
, i
);
921 brw_LINE(p
, brw_null_reg(), interp
[i
], delta0
);
922 brw_MAC(p
, dst
, suboffset(interp
[i
],1), delta1
);
927 static void emit_cinterp(struct brw_wm_compile
*c
,
928 const struct prog_instruction
*inst
)
930 struct brw_compile
*p
= &c
->func
;
931 GLuint mask
= inst
->DstReg
.WriteMask
;
933 struct brw_reg interp
[4];
934 struct brw_reg dst
, src0
;
937 src0
= get_src_reg(c
, inst
, 0, 0);
940 interp
[0] = brw_vec1_grf(nr
, 0);
941 interp
[1] = brw_vec1_grf(nr
, 4);
942 interp
[2] = brw_vec1_grf(nr
+1, 0);
943 interp
[3] = brw_vec1_grf(nr
+1, 4);
945 for(i
= 0; i
< 4; i
++ ) {
947 dst
= get_dst_reg(c
, inst
, i
);
948 brw_MOV(p
, dst
, suboffset(interp
[i
],3));
953 static void emit_pinterp(struct brw_wm_compile
*c
,
954 const struct prog_instruction
*inst
)
956 struct brw_compile
*p
= &c
->func
;
957 GLuint mask
= inst
->DstReg
.WriteMask
;
959 struct brw_reg interp
[4];
960 struct brw_reg dst
, delta0
, delta1
;
961 struct brw_reg src0
, w
;
964 src0
= get_src_reg(c
, inst
, 0, 0);
965 delta0
= get_src_reg(c
, inst
, 1, 0);
966 delta1
= get_src_reg(c
, inst
, 1, 1);
967 w
= get_src_reg(c
, inst
, 2, 3);
970 interp
[0] = brw_vec1_grf(nr
, 0);
971 interp
[1] = brw_vec1_grf(nr
, 4);
972 interp
[2] = brw_vec1_grf(nr
+1, 0);
973 interp
[3] = brw_vec1_grf(nr
+1, 4);
975 for(i
= 0; i
< 4; i
++ ) {
977 dst
= get_dst_reg(c
, inst
, i
);
978 brw_LINE(p
, brw_null_reg(), interp
[i
], delta0
);
979 brw_MAC(p
, dst
, suboffset(interp
[i
],1),
981 brw_MUL(p
, dst
, dst
, w
);
986 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
987 static void emit_frontfacing(struct brw_wm_compile
*c
,
988 const struct prog_instruction
*inst
)
990 struct brw_compile
*p
= &c
->func
;
991 struct brw_reg r1_6ud
= retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD
);
993 GLuint mask
= inst
->DstReg
.WriteMask
;
996 for (i
= 0; i
< 4; i
++) {
998 dst
= get_dst_reg(c
, inst
, i
);
999 brw_MOV(p
, dst
, brw_imm_f(0.0));
1003 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1006 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, r1_6ud
, brw_imm_ud(1 << 31));
1007 for (i
= 0; i
< 4; i
++) {
1008 if (mask
& (1<<i
)) {
1009 dst
= get_dst_reg(c
, inst
, i
);
1010 brw_MOV(p
, dst
, brw_imm_f(1.0));
1013 brw_set_predicate_control_flag_value(p
, 0xff);
1016 static void emit_xpd(struct brw_wm_compile
*c
,
1017 const struct prog_instruction
*inst
)
1020 struct brw_compile
*p
= &c
->func
;
1021 GLuint mask
= inst
->DstReg
.WriteMask
;
1022 for (i
= 0; i
< 4; i
++) {
1023 GLuint i2
= (i
+2)%3;
1024 GLuint i1
= (i
+1)%3;
1025 if (mask
& (1<<i
)) {
1026 struct brw_reg src0
, src1
, dst
;
1027 dst
= get_dst_reg(c
, inst
, i
);
1028 src0
= negate(get_src_reg(c
, inst
, 0, i2
));
1029 src1
= get_src_reg_imm(c
, inst
, 1, i1
);
1030 brw_MUL(p
, brw_null_reg(), src0
, src1
);
1031 src0
= get_src_reg(c
, inst
, 0, i1
);
1032 src1
= get_src_reg_imm(c
, inst
, 1, i2
);
1033 brw_set_saturate(p
, inst
->SaturateMode
!= SATURATE_OFF
);
1034 brw_MAC(p
, dst
, src0
, src1
);
1035 brw_set_saturate(p
, 0);
1038 brw_set_saturate(p
, 0);
1042 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1043 * Note that the result of the function is smeared across the dest
1044 * register's X, Y, Z and W channels (subject to writemasking of course).
1046 static void emit_math1(struct brw_wm_compile
*c
,
1047 const struct prog_instruction
*inst
, GLuint func
)
1049 struct brw_compile
*p
= &c
->func
;
1050 struct brw_reg src0
, dst
;
1051 GLuint mask
= inst
->DstReg
.WriteMask
;
1052 int dst_chan
= _mesa_ffs(mask
& WRITEMASK_XYZW
) - 1;
1054 if (!(mask
& WRITEMASK_XYZW
))
1057 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
1059 /* Get first component of source register */
1060 dst
= get_dst_reg(c
, inst
, dst_chan
);
1061 src0
= get_src_reg(c
, inst
, 0, 0);
1063 brw_MOV(p
, brw_message_reg(2), src0
);
1067 (inst
->SaturateMode
!= SATURATE_OFF
) ? BRW_MATH_SATURATE_SATURATE
: BRW_MATH_SATURATE_NONE
,
1070 BRW_MATH_DATA_VECTOR
,
1071 BRW_MATH_PRECISION_FULL
);
1074 static void emit_rcp(struct brw_wm_compile
*c
,
1075 const struct prog_instruction
*inst
)
1077 emit_math1(c
, inst
, BRW_MATH_FUNCTION_INV
);
1080 static void emit_rsq(struct brw_wm_compile
*c
,
1081 const struct prog_instruction
*inst
)
1083 emit_math1(c
, inst
, BRW_MATH_FUNCTION_RSQ
);
1086 static void emit_sin(struct brw_wm_compile
*c
,
1087 const struct prog_instruction
*inst
)
1089 emit_math1(c
, inst
, BRW_MATH_FUNCTION_SIN
);
1092 static void emit_cos(struct brw_wm_compile
*c
,
1093 const struct prog_instruction
*inst
)
1095 emit_math1(c
, inst
, BRW_MATH_FUNCTION_COS
);
1098 static void emit_ex2(struct brw_wm_compile
*c
,
1099 const struct prog_instruction
*inst
)
1101 emit_math1(c
, inst
, BRW_MATH_FUNCTION_EXP
);
1104 static void emit_lg2(struct brw_wm_compile
*c
,
1105 const struct prog_instruction
*inst
)
1107 emit_math1(c
, inst
, BRW_MATH_FUNCTION_LOG
);
1110 static void emit_arl(struct brw_wm_compile
*c
,
1111 const struct prog_instruction
*inst
)
1113 struct brw_compile
*p
= &c
->func
;
1114 struct brw_reg src0
, addr_reg
;
1115 brw_set_saturate(p
, (inst
->SaturateMode
!= SATURATE_OFF
) ? 1 : 0);
1116 addr_reg
= brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE
,
1117 BRW_ARF_ADDRESS
, 0);
1118 src0
= get_src_reg(c
, inst
, 0, 0); /* channel 0 */
1119 brw_MOV(p
, addr_reg
, src0
);
1120 brw_set_saturate(p
, 0);
1124 static void emit_min_max(struct brw_wm_compile
*c
,
1125 const struct prog_instruction
*inst
)
1127 struct brw_compile
*p
= &c
->func
;
1128 const GLuint mask
= inst
->DstReg
.WriteMask
;
1129 const int mark
= mark_tmps(c
);
1131 brw_push_insn_state(p
);
1132 for (i
= 0; i
< 4; i
++) {
1133 if (mask
& (1<<i
)) {
1134 struct brw_reg real_dst
= get_dst_reg(c
, inst
, i
);
1135 struct brw_reg src0
= get_src_reg(c
, inst
, 0, i
);
1136 struct brw_reg src1
= get_src_reg(c
, inst
, 1, i
);
1138 /* if dst==src0 or dst==src1 we need to use a temp reg */
1139 GLboolean use_temp
= brw_same_reg(dst
, src0
) ||
1140 brw_same_reg(dst
, src1
);
1147 printf(" Min/max: dst %d src0 %d src1 %d\n",
1148 dst.nr, src0.nr, src1.nr);
1150 brw_set_saturate(p
, (inst
->SaturateMode
!= SATURATE_OFF
) ? 1 : 0);
1151 brw_MOV(p
, dst
, src0
);
1152 brw_set_saturate(p
, 0);
1154 if (inst
->Opcode
== OPCODE_MIN
)
1155 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, src1
, src0
);
1157 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, src1
, src0
);
1159 brw_set_saturate(p
, (inst
->SaturateMode
!= SATURATE_OFF
) ? 1 : 0);
1160 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
1161 brw_MOV(p
, dst
, src1
);
1162 brw_set_saturate(p
, 0);
1163 brw_set_predicate_control_flag_value(p
, 0xff);
1165 brw_MOV(p
, real_dst
, dst
);
1168 brw_pop_insn_state(p
);
1169 release_tmps(c
, mark
);
1172 static void emit_pow(struct brw_wm_compile
*c
,
1173 const struct prog_instruction
*inst
)
1175 struct brw_compile
*p
= &c
->func
;
1176 struct brw_reg dst
, src0
, src1
;
1177 GLuint mask
= inst
->DstReg
.WriteMask
;
1178 int dst_chan
= _mesa_ffs(mask
& WRITEMASK_XYZW
) - 1;
1180 if (!(mask
& WRITEMASK_XYZW
))
1183 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
1185 dst
= get_dst_reg(c
, inst
, dst_chan
);
1186 src0
= get_src_reg_imm(c
, inst
, 0, 0);
1187 src1
= get_src_reg_imm(c
, inst
, 1, 0);
1189 brw_MOV(p
, brw_message_reg(2), src0
);
1190 brw_MOV(p
, brw_message_reg(3), src1
);
1194 BRW_MATH_FUNCTION_POW
,
1195 (inst
->SaturateMode
!= SATURATE_OFF
) ? BRW_MATH_SATURATE_SATURATE
: BRW_MATH_SATURATE_NONE
,
1198 BRW_MATH_DATA_VECTOR
,
1199 BRW_MATH_PRECISION_FULL
);
1203 * For GLSL shaders, this KIL will be unconditional.
1204 * It may be contained inside an IF/ENDIF structure of course.
1206 static void emit_kil(struct brw_wm_compile
*c
)
1208 struct brw_compile
*p
= &c
->func
;
1209 struct brw_reg depth
= retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW
);
1210 brw_push_insn_state(p
);
1211 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
1212 brw_NOT(p
, c
->emit_mask_reg
, brw_mask_reg(1)); //IMASK
1213 brw_AND(p
, depth
, c
->emit_mask_reg
, depth
);
1214 brw_pop_insn_state(p
);
1217 static INLINE
struct brw_reg
high_words( struct brw_reg reg
)
1219 return stride( suboffset( retype( reg
, BRW_REGISTER_TYPE_W
), 1 ),
1223 static INLINE
struct brw_reg
low_words( struct brw_reg reg
)
1225 return stride( retype( reg
, BRW_REGISTER_TYPE_W
), 0, 8, 2 );
1228 static INLINE
struct brw_reg
even_bytes( struct brw_reg reg
)
1230 return stride( retype( reg
, BRW_REGISTER_TYPE_B
), 0, 16, 2 );
1233 static INLINE
struct brw_reg
odd_bytes( struct brw_reg reg
)
1235 return stride( suboffset( retype( reg
, BRW_REGISTER_TYPE_B
), 1 ),
1239 /* One-, two- and three-dimensional Perlin noise, similar to the description
1240 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1241 static void noise1_sub( struct brw_wm_compile
*c
) {
1243 struct brw_compile
*p
= &c
->func
;
1244 struct brw_reg param
,
1245 x0
, x1
, /* gradients at each end */
1246 t
, tmp
[ 2 ], /* float temporaries */
1247 itmp
[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1249 int mark
= mark_tmps( c
);
1251 x0
= alloc_tmp( c
);
1252 x1
= alloc_tmp( c
);
1254 tmp
[ 0 ] = alloc_tmp( c
);
1255 tmp
[ 1 ] = alloc_tmp( c
);
1256 itmp
[ 0 ] = retype( tmp
[ 0 ], BRW_REGISTER_TYPE_UD
);
1257 itmp
[ 1 ] = retype( tmp
[ 1 ], BRW_REGISTER_TYPE_UD
);
1258 itmp
[ 2 ] = retype( x0
, BRW_REGISTER_TYPE_UD
);
1259 itmp
[ 3 ] = retype( x1
, BRW_REGISTER_TYPE_UD
);
1260 itmp
[ 4 ] = retype( t
, BRW_REGISTER_TYPE_UD
);
1262 param
= lookup_tmp( c
, mark
- 2 );
1264 brw_set_access_mode( p
, BRW_ALIGN_1
);
1266 brw_MOV( p
, itmp
[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1268 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1269 be hashed. Also compute the remainder (offset within the unit
1270 length), interleaved to reduce register dependency penalties. */
1271 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param
);
1272 brw_FRC( p
, param
, param
);
1273 brw_ADD( p
, itmp
[ 1 ], itmp
[ 0 ], brw_imm_ud( 1 ) );
1274 brw_MOV( p
, itmp
[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1275 brw_MOV( p
, itmp
[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1277 /* We're now ready to perform the hashing. The two hashes are
1278 interleaved for performance. The hash function used is
1279 designed to rapidly achieve avalanche and require only 32x16
1280 bit multiplication, and 16-bit swizzles (which we get for
1281 free). We can't use immediate operands in the multiplies,
1282 because immediates are permitted only in src1 and the 16-bit
1283 factor is permitted only in src0. */
1284 for( i
= 0; i
< 2; i
++ )
1285 brw_MUL( p
, itmp
[ i
], itmp
[ 2 ], itmp
[ i
] );
1286 for( i
= 0; i
< 2; i
++ )
1287 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
1288 high_words( itmp
[ i
] ) );
1289 for( i
= 0; i
< 2; i
++ )
1290 brw_MUL( p
, itmp
[ i
], itmp
[ 3 ], itmp
[ i
] );
1291 for( i
= 0; i
< 2; i
++ )
1292 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
1293 high_words( itmp
[ i
] ) );
1294 for( i
= 0; i
< 2; i
++ )
1295 brw_MUL( p
, itmp
[ i
], itmp
[ 4 ], itmp
[ i
] );
1296 for( i
= 0; i
< 2; i
++ )
1297 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
1298 high_words( itmp
[ i
] ) );
1300 /* Now we want to initialise the two gradients based on the
1301 hashes. Format conversion from signed integer to float leaves
1302 everything scaled too high by a factor of pow( 2, 31 ), but
1303 we correct for that right at the end. */
1304 brw_ADD( p
, t
, param
, brw_imm_f( -1.0 ) );
1305 brw_MOV( p
, x0
, retype( tmp
[ 0 ], BRW_REGISTER_TYPE_D
) );
1306 brw_MOV( p
, x1
, retype( tmp
[ 1 ], BRW_REGISTER_TYPE_D
) );
1308 brw_MUL( p
, x0
, x0
, param
);
1309 brw_MUL( p
, x1
, x1
, t
);
1311 /* We interpolate between the gradients using the polynomial
1312 6t^5 - 15t^4 + 10t^3 (Perlin). */
1313 brw_MUL( p
, tmp
[ 0 ], param
, brw_imm_f( 6.0 ) );
1314 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( -15.0 ) );
1315 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param
);
1316 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( 10.0 ) );
1317 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param
);
1318 brw_ADD( p
, x1
, x1
, negate( x0
) ); /* unrelated work to fill the
1320 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param
);
1321 brw_MUL( p
, param
, tmp
[ 0 ], param
);
1322 brw_MUL( p
, x1
, x1
, param
);
1323 brw_ADD( p
, x0
, x0
, x1
);
1324 /* scale by pow( 2, -30 ), to compensate for the format conversion
1325 above and an extra factor of 2 so that a single gradient covers
1327 brw_MUL( p
, param
, x0
, brw_imm_f( 0.000000000931322574615478515625 ) );
1329 release_tmps( c
, mark
);
1332 static void emit_noise1( struct brw_wm_compile
*c
,
1333 const struct prog_instruction
*inst
)
1335 struct brw_compile
*p
= &c
->func
;
1336 struct brw_reg src
, param
, dst
;
1337 GLuint mask
= inst
->DstReg
.WriteMask
;
1339 int mark
= mark_tmps( c
);
1341 assert( mark
== 0 );
1343 src
= get_src_reg( c
, inst
, 0, 0 );
1345 param
= alloc_tmp( c
);
1347 brw_MOV( p
, param
, src
);
1349 invoke_subroutine( c
, SUB_NOISE1
, noise1_sub
);
1351 /* Fill in the result: */
1352 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
1353 for (i
= 0 ; i
< 4; i
++) {
1354 if (mask
& (1<<i
)) {
1355 dst
= get_dst_reg(c
, inst
, i
);
1356 brw_MOV( p
, dst
, param
);
1359 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
1360 brw_set_saturate( p
, 0 );
1362 release_tmps( c
, mark
);
1365 static void noise2_sub( struct brw_wm_compile
*c
) {
1367 struct brw_compile
*p
= &c
->func
;
1368 struct brw_reg param0
, param1
,
1369 x0y0
, x0y1
, x1y0
, x1y1
, /* gradients at each corner */
1370 t
, tmp
[ 4 ], /* float temporaries */
1371 itmp
[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1373 int mark
= mark_tmps( c
);
1375 x0y0
= alloc_tmp( c
);
1376 x0y1
= alloc_tmp( c
);
1377 x1y0
= alloc_tmp( c
);
1378 x1y1
= alloc_tmp( c
);
1380 for( i
= 0; i
< 4; i
++ ) {
1381 tmp
[ i
] = alloc_tmp( c
);
1382 itmp
[ i
] = retype( tmp
[ i
], BRW_REGISTER_TYPE_UD
);
1384 itmp
[ 4 ] = retype( x0y0
, BRW_REGISTER_TYPE_UD
);
1385 itmp
[ 5 ] = retype( x0y1
, BRW_REGISTER_TYPE_UD
);
1386 itmp
[ 6 ] = retype( x1y0
, BRW_REGISTER_TYPE_UD
);
1388 param0
= lookup_tmp( c
, mark
- 3 );
1389 param1
= lookup_tmp( c
, mark
- 2 );
1391 brw_set_access_mode( p
, BRW_ALIGN_1
);
1393 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1394 be hashed. Also compute the remainders (offsets within the unit
1395 square), interleaved to reduce register dependency penalties. */
1396 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param0
);
1397 brw_RNDD( p
, retype( itmp
[ 1 ], BRW_REGISTER_TYPE_D
), param1
);
1398 brw_FRC( p
, param0
, param0
);
1399 brw_FRC( p
, param1
, param1
);
1400 brw_MOV( p
, itmp
[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1401 brw_ADD( p
, high_words( itmp
[ 0 ] ), high_words( itmp
[ 0 ] ),
1402 low_words( itmp
[ 1 ] ) );
1403 brw_MOV( p
, itmp
[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1404 brw_MOV( p
, itmp
[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1405 brw_ADD( p
, itmp
[ 1 ], itmp
[ 0 ], brw_imm_ud( 0x10000 ) );
1406 brw_ADD( p
, itmp
[ 2 ], itmp
[ 0 ], brw_imm_ud( 0x1 ) );
1407 brw_ADD( p
, itmp
[ 3 ], itmp
[ 0 ], brw_imm_ud( 0x10001 ) );
1409 /* We're now ready to perform the hashing. The four hashes are
1410 interleaved for performance. The hash function used is
1411 designed to rapidly achieve avalanche and require only 32x16
1412 bit multiplication, and 16-bit swizzles (which we get for
1413 free). We can't use immediate operands in the multiplies,
1414 because immediates are permitted only in src1 and the 16-bit
1415 factor is permitted only in src0. */
1416 for( i
= 0; i
< 4; i
++ )
1417 brw_MUL( p
, itmp
[ i
], itmp
[ 4 ], itmp
[ i
] );
1418 for( i
= 0; i
< 4; i
++ )
1419 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
1420 high_words( itmp
[ i
] ) );
1421 for( i
= 0; i
< 4; i
++ )
1422 brw_MUL( p
, itmp
[ i
], itmp
[ 5 ], itmp
[ i
] );
1423 for( i
= 0; i
< 4; i
++ )
1424 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
1425 high_words( itmp
[ i
] ) );
1426 for( i
= 0; i
< 4; i
++ )
1427 brw_MUL( p
, itmp
[ i
], itmp
[ 6 ], itmp
[ i
] );
1428 for( i
= 0; i
< 4; i
++ )
1429 brw_XOR( p
, low_words( itmp
[ i
] ), low_words( itmp
[ i
] ),
1430 high_words( itmp
[ i
] ) );
1432 /* Now we want to initialise the four gradients based on the
1433 hashes. Format conversion from signed integer to float leaves
1434 everything scaled too high by a factor of pow( 2, 15 ), but
1435 we correct for that right at the end. */
1436 brw_ADD( p
, t
, param0
, brw_imm_f( -1.0 ) );
1437 brw_MOV( p
, x0y0
, low_words( tmp
[ 0 ] ) );
1438 brw_MOV( p
, x0y1
, low_words( tmp
[ 1 ] ) );
1439 brw_MOV( p
, x1y0
, low_words( tmp
[ 2 ] ) );
1440 brw_MOV( p
, x1y1
, low_words( tmp
[ 3 ] ) );
1442 brw_MOV( p
, tmp
[ 0 ], high_words( tmp
[ 0 ] ) );
1443 brw_MOV( p
, tmp
[ 1 ], high_words( tmp
[ 1 ] ) );
1444 brw_MOV( p
, tmp
[ 2 ], high_words( tmp
[ 2 ] ) );
1445 brw_MOV( p
, tmp
[ 3 ], high_words( tmp
[ 3 ] ) );
1447 brw_MUL( p
, x1y0
, x1y0
, t
);
1448 brw_MUL( p
, x1y1
, x1y1
, t
);
1449 brw_ADD( p
, t
, param1
, brw_imm_f( -1.0 ) );
1450 brw_MUL( p
, x0y0
, x0y0
, param0
);
1451 brw_MUL( p
, x0y1
, x0y1
, param0
);
1453 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param1
);
1454 brw_MUL( p
, tmp
[ 2 ], tmp
[ 2 ], param1
);
1455 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], t
);
1456 brw_MUL( p
, tmp
[ 3 ], tmp
[ 3 ], t
);
1458 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 0 ] );
1459 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 2 ] );
1460 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 1 ] );
1461 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 3 ] );
1463 /* We interpolate between the gradients using the polynomial
1464 6t^5 - 15t^4 + 10t^3 (Perlin). */
1465 brw_MUL( p
, tmp
[ 0 ], param0
, brw_imm_f( 6.0 ) );
1466 brw_MUL( p
, tmp
[ 1 ], param1
, brw_imm_f( 6.0 ) );
1467 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( -15.0 ) );
1468 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], brw_imm_f( -15.0 ) );
1469 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param0
);
1470 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], param1
);
1471 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) ); /* unrelated work to fill the
1473 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], brw_imm_f( 10.0 ) );
1474 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], brw_imm_f( 10.0 ) );
1475 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param0
);
1476 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], param1
);
1477 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) ); /* unrelated work to fill the
1479 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], param0
);
1480 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], param1
);
1481 brw_MUL( p
, param0
, tmp
[ 0 ], param0
);
1482 brw_MUL( p
, param1
, tmp
[ 1 ], param1
);
1484 /* Here we interpolate in the y dimension... */
1485 brw_MUL( p
, x0y1
, x0y1
, param1
);
1486 brw_MUL( p
, x1y1
, x1y1
, param1
);
1487 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
1488 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
1490 /* And now in x. There are horrible register dependencies here,
1491 but we have nothing else to do. */
1492 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
1493 brw_MUL( p
, x1y0
, x1y0
, param0
);
1494 brw_ADD( p
, x0y0
, x0y0
, x1y0
);
1496 /* scale by pow( 2, -15 ), as described above */
1497 brw_MUL( p
, param0
, x0y0
, brw_imm_f( 0.000030517578125 ) );
1499 release_tmps( c
, mark
);
1502 static void emit_noise2( struct brw_wm_compile
*c
,
1503 const struct prog_instruction
*inst
)
1505 struct brw_compile
*p
= &c
->func
;
1506 struct brw_reg src0
, src1
, param0
, param1
, dst
;
1507 GLuint mask
= inst
->DstReg
.WriteMask
;
1509 int mark
= mark_tmps( c
);
1511 assert( mark
== 0 );
1513 src0
= get_src_reg( c
, inst
, 0, 0 );
1514 src1
= get_src_reg( c
, inst
, 0, 1 );
1516 param0
= alloc_tmp( c
);
1517 param1
= alloc_tmp( c
);
1519 brw_MOV( p
, param0
, src0
);
1520 brw_MOV( p
, param1
, src1
);
1522 invoke_subroutine( c
, SUB_NOISE2
, noise2_sub
);
1524 /* Fill in the result: */
1525 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
1526 for (i
= 0 ; i
< 4; i
++) {
1527 if (mask
& (1<<i
)) {
1528 dst
= get_dst_reg(c
, inst
, i
);
1529 brw_MOV( p
, dst
, param0
);
1532 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
1533 brw_set_saturate( p
, 0 );
1535 release_tmps( c
, mark
);
1539 * The three-dimensional case is much like the one- and two- versions above,
1540 * but since the number of corners is rapidly growing we now pack 16 16-bit
1541 * hashes into each register to extract more parallelism from the EUs.
1543 static void noise3_sub( struct brw_wm_compile
*c
) {
1545 struct brw_compile
*p
= &c
->func
;
1546 struct brw_reg param0
, param1
, param2
,
1547 x0y0
, x0y1
, x1y0
, x1y1
, /* gradients at four of the corners */
1548 xi
, yi
, zi
, /* interpolation coefficients */
1549 t
, tmp
[ 8 ], /* float temporaries */
1550 itmp
[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1551 wtmp
[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1553 int mark
= mark_tmps( c
);
1555 x0y0
= alloc_tmp( c
);
1556 x0y1
= alloc_tmp( c
);
1557 x1y0
= alloc_tmp( c
);
1558 x1y1
= alloc_tmp( c
);
1559 xi
= alloc_tmp( c
);
1560 yi
= alloc_tmp( c
);
1561 zi
= alloc_tmp( c
);
1563 for( i
= 0; i
< 8; i
++ ) {
1564 tmp
[ i
] = alloc_tmp( c
);
1565 itmp
[ i
] = retype( tmp
[ i
], BRW_REGISTER_TYPE_UD
);
1566 wtmp
[ i
] = brw_uw16_grf( tmp
[ i
].nr
, 0 );
1569 param0
= lookup_tmp( c
, mark
- 4 );
1570 param1
= lookup_tmp( c
, mark
- 3 );
1571 param2
= lookup_tmp( c
, mark
- 2 );
1573 brw_set_access_mode( p
, BRW_ALIGN_1
);
1575 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1576 be hashed. Also compute the remainders (offsets within the unit
1577 cube), interleaved to reduce register dependency penalties. */
1578 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param0
);
1579 brw_RNDD( p
, retype( itmp
[ 1 ], BRW_REGISTER_TYPE_D
), param1
);
1580 brw_RNDD( p
, retype( itmp
[ 2 ], BRW_REGISTER_TYPE_D
), param2
);
1581 brw_FRC( p
, param0
, param0
);
1582 brw_FRC( p
, param1
, param1
);
1583 brw_FRC( p
, param2
, param2
);
1584 /* Since we now have only 16 bits of precision in the hash, we must
1585 be more careful about thorough mixing to maintain entropy as we
1586 squash the input vector into a small scalar. */
1587 brw_MUL( p
, brw_null_reg(), low_words( itmp
[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1588 brw_MAC( p
, brw_null_reg(), low_words( itmp
[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1589 brw_MAC( p
, low_words( itmp
[ 0 ] ), low_words( itmp
[ 2 ] ),
1590 brw_imm_uw( 0x9B93 ) );
1591 brw_ADD( p
, high_words( itmp
[ 0 ] ), low_words( itmp
[ 0 ] ),
1592 brw_imm_uw( 0xBC8F ) );
1594 /* Temporarily disable the execution mask while we work with ExecSize=16
1595 channels (the mask is set for ExecSize=8 and is probably incorrect).
1596 Although this might cause execution of unwanted channels, the code
1597 writes only to temporary registers and has no side effects, so
1598 disabling the mask is harmless. */
1599 brw_push_insn_state( p
);
1600 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1601 brw_ADD( p
, wtmp
[ 1 ], wtmp
[ 0 ], brw_imm_uw( 0xD0BD ) );
1602 brw_ADD( p
, wtmp
[ 2 ], wtmp
[ 0 ], brw_imm_uw( 0x9B93 ) );
1603 brw_ADD( p
, wtmp
[ 3 ], wtmp
[ 1 ], brw_imm_uw( 0x9B93 ) );
1605 /* We're now ready to perform the hashing. The eight hashes are
1606 interleaved for performance. The hash function used is
1607 designed to rapidly achieve avalanche and require only 16x16
1608 bit multiplication, and 8-bit swizzles (which we get for
1610 for( i
= 0; i
< 4; i
++ )
1611 brw_MUL( p
, wtmp
[ i
], wtmp
[ i
], brw_imm_uw( 0x28D9 ) );
1612 for( i
= 0; i
< 4; i
++ )
1613 brw_XOR( p
, even_bytes( wtmp
[ i
] ), even_bytes( wtmp
[ i
] ),
1614 odd_bytes( wtmp
[ i
] ) );
1615 for( i
= 0; i
< 4; i
++ )
1616 brw_MUL( p
, wtmp
[ i
], wtmp
[ i
], brw_imm_uw( 0xC6D5 ) );
1617 for( i
= 0; i
< 4; i
++ )
1618 brw_XOR( p
, even_bytes( wtmp
[ i
] ), even_bytes( wtmp
[ i
] ),
1619 odd_bytes( wtmp
[ i
] ) );
1620 brw_pop_insn_state( p
);
1622 /* Now we want to initialise the four rear gradients based on the
1623 hashes. Format conversion from signed integer to float leaves
1624 everything scaled too high by a factor of pow( 2, 15 ), but
1625 we correct for that right at the end. */
1627 brw_ADD( p
, t
, param0
, brw_imm_f( -1.0 ) );
1628 brw_MOV( p
, x0y0
, low_words( tmp
[ 0 ] ) );
1629 brw_MOV( p
, x0y1
, low_words( tmp
[ 1 ] ) );
1630 brw_MOV( p
, x1y0
, high_words( tmp
[ 0 ] ) );
1631 brw_MOV( p
, x1y1
, high_words( tmp
[ 1 ] ) );
1633 brw_push_insn_state( p
);
1634 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1635 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 5 ) );
1636 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 5 ) );
1637 brw_pop_insn_state( p
);
1639 brw_MUL( p
, x1y0
, x1y0
, t
);
1640 brw_MUL( p
, x1y1
, x1y1
, t
);
1641 brw_ADD( p
, t
, param1
, brw_imm_f( -1.0 ) );
1642 brw_MUL( p
, x0y0
, x0y0
, param0
);
1643 brw_MUL( p
, x0y1
, x0y1
, param0
);
1646 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
1647 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
1648 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
1649 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
1651 brw_push_insn_state( p
);
1652 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1653 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 5 ) );
1654 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 5 ) );
1655 brw_pop_insn_state( p
);
1657 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1658 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1659 brw_ADD( p
, t
, param0
, brw_imm_f( -1.0 ) );
1660 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param1
);
1661 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param1
);
1663 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1664 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1665 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1666 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1669 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
1670 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
1671 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
1672 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
1674 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param2
);
1675 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], param2
);
1676 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param2
);
1677 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], param2
);
1679 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1680 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1681 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1682 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1684 /* We interpolate between the gradients using the polynomial
1685 6t^5 - 15t^4 + 10t^3 (Perlin). */
1686 brw_MUL( p
, xi
, param0
, brw_imm_f( 6.0 ) );
1687 brw_MUL( p
, yi
, param1
, brw_imm_f( 6.0 ) );
1688 brw_MUL( p
, zi
, param2
, brw_imm_f( 6.0 ) );
1689 brw_ADD( p
, xi
, xi
, brw_imm_f( -15.0 ) );
1690 brw_ADD( p
, yi
, yi
, brw_imm_f( -15.0 ) );
1691 brw_ADD( p
, zi
, zi
, brw_imm_f( -15.0 ) );
1692 brw_MUL( p
, xi
, xi
, param0
);
1693 brw_MUL( p
, yi
, yi
, param1
);
1694 brw_MUL( p
, zi
, zi
, param2
);
1695 brw_ADD( p
, xi
, xi
, brw_imm_f( 10.0 ) );
1696 brw_ADD( p
, yi
, yi
, brw_imm_f( 10.0 ) );
1697 brw_ADD( p
, zi
, zi
, brw_imm_f( 10.0 ) );
1698 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) ); /* unrelated work */
1699 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) ); /* unrelated work */
1700 brw_MUL( p
, xi
, xi
, param0
);
1701 brw_MUL( p
, yi
, yi
, param1
);
1702 brw_MUL( p
, zi
, zi
, param2
);
1703 brw_MUL( p
, xi
, xi
, param0
);
1704 brw_MUL( p
, yi
, yi
, param1
);
1705 brw_MUL( p
, zi
, zi
, param2
);
1706 brw_MUL( p
, xi
, xi
, param0
);
1707 brw_MUL( p
, yi
, yi
, param1
);
1708 brw_MUL( p
, zi
, zi
, param2
);
1710 /* Here we interpolate in the y dimension... */
1711 brw_MUL( p
, x0y1
, x0y1
, yi
);
1712 brw_MUL( p
, x1y1
, x1y1
, yi
);
1713 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
1714 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
1716 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1717 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
1718 brw_MUL( p
, x1y0
, x1y0
, xi
);
1719 brw_ADD( p
, tmp
[ 0 ], x0y0
, x1y0
);
1721 /* Now do the same thing for the front four gradients... */
1723 brw_MOV( p
, x0y0
, low_words( tmp
[ 2 ] ) );
1724 brw_MOV( p
, x0y1
, low_words( tmp
[ 3 ] ) );
1725 brw_MOV( p
, x1y0
, high_words( tmp
[ 2 ] ) );
1726 brw_MOV( p
, x1y1
, high_words( tmp
[ 3 ] ) );
1728 brw_push_insn_state( p
);
1729 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1730 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 5 ) );
1731 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 5 ) );
1732 brw_pop_insn_state( p
);
1734 brw_MUL( p
, x1y0
, x1y0
, t
);
1735 brw_MUL( p
, x1y1
, x1y1
, t
);
1736 brw_ADD( p
, t
, param1
, brw_imm_f( -1.0 ) );
1737 brw_MUL( p
, x0y0
, x0y0
, param0
);
1738 brw_MUL( p
, x0y1
, x0y1
, param0
);
1741 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
1742 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
1743 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
1744 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
1746 brw_push_insn_state( p
);
1747 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1748 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 5 ) );
1749 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 5 ) );
1750 brw_pop_insn_state( p
);
1752 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1753 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1754 brw_ADD( p
, t
, param2
, brw_imm_f( -1.0 ) );
1755 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param1
);
1756 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param1
);
1758 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1759 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1760 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1761 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1764 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
1765 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
1766 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
1767 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
1769 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], t
);
1770 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
1771 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], t
);
1772 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
1774 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
1775 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
1776 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
1777 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
1779 /* The interpolation coefficients are still around from last time, so
1780 again interpolate in the y dimension... */
1781 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) );
1782 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) );
1783 brw_MUL( p
, x0y1
, x0y1
, yi
);
1784 brw_MUL( p
, x1y1
, x1y1
, yi
);
1785 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
1786 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
1788 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1789 time put the front face in tmp[ 1 ] and we're nearly there... */
1790 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
1791 brw_MUL( p
, x1y0
, x1y0
, xi
);
1792 brw_ADD( p
, tmp
[ 1 ], x0y0
, x1y0
);
1794 /* The final interpolation, in the z dimension: */
1795 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], negate( tmp
[ 0 ] ) );
1796 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], zi
);
1797 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], tmp
[ 1 ] );
1799 /* scale by pow( 2, -15 ), as described above */
1800 brw_MUL( p
, param0
, tmp
[ 0 ], brw_imm_f( 0.000030517578125 ) );
1802 release_tmps( c
, mark
);
1805 static void emit_noise3( struct brw_wm_compile
*c
,
1806 const struct prog_instruction
*inst
)
1808 struct brw_compile
*p
= &c
->func
;
1809 struct brw_reg src0
, src1
, src2
, param0
, param1
, param2
, dst
;
1810 GLuint mask
= inst
->DstReg
.WriteMask
;
1812 int mark
= mark_tmps( c
);
1814 assert( mark
== 0 );
1816 src0
= get_src_reg( c
, inst
, 0, 0 );
1817 src1
= get_src_reg( c
, inst
, 0, 1 );
1818 src2
= get_src_reg( c
, inst
, 0, 2 );
1820 param0
= alloc_tmp( c
);
1821 param1
= alloc_tmp( c
);
1822 param2
= alloc_tmp( c
);
1824 brw_MOV( p
, param0
, src0
);
1825 brw_MOV( p
, param1
, src1
);
1826 brw_MOV( p
, param2
, src2
);
1828 invoke_subroutine( c
, SUB_NOISE3
, noise3_sub
);
1830 /* Fill in the result: */
1831 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
1832 for (i
= 0 ; i
< 4; i
++) {
1833 if (mask
& (1<<i
)) {
1834 dst
= get_dst_reg(c
, inst
, i
);
1835 brw_MOV( p
, dst
, param0
);
1838 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
1839 brw_set_saturate( p
, 0 );
1841 release_tmps( c
, mark
);
/**
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes.  Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 */
1851 static void noise4_sub( struct brw_wm_compile
*c
)
1853 struct brw_compile
*p
= &c
->func
;
1854 struct brw_reg param
[ 4 ],
1855 x0y0
, x0y1
, x1y0
, x1y1
, /* gradients at four of the corners */
1856 w0
, /* noise for the w=0 cube */
1857 floors
[ 2 ], /* integer coordinates of base corner of hypercube */
1858 interp
[ 4 ], /* interpolation coefficients */
1859 t
, tmp
[ 8 ], /* float temporaries */
1860 itmp
[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1861 wtmp
[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1863 int mark
= mark_tmps( c
);
1864 GLuint loop
, origin
;
1866 x0y0
= alloc_tmp( c
);
1867 x0y1
= alloc_tmp( c
);
1868 x1y0
= alloc_tmp( c
);
1869 x1y1
= alloc_tmp( c
);
1871 w0
= alloc_tmp( c
);
1872 floors
[ 0 ] = retype( alloc_tmp( c
), BRW_REGISTER_TYPE_UD
);
1873 floors
[ 1 ] = retype( alloc_tmp( c
), BRW_REGISTER_TYPE_UD
);
1875 for( i
= 0; i
< 4; i
++ ) {
1876 param
[ i
] = lookup_tmp( c
, mark
- 5 + i
);
1877 interp
[ i
] = alloc_tmp( c
);
1880 for( i
= 0; i
< 8; i
++ ) {
1881 tmp
[ i
] = alloc_tmp( c
);
1882 itmp
[ i
] = retype( tmp
[ i
], BRW_REGISTER_TYPE_UD
);
1883 wtmp
[ i
] = brw_uw16_grf( tmp
[ i
].nr
, 0 );
1886 brw_set_access_mode( p
, BRW_ALIGN_1
);
1888 /* We only want 16 bits of precision from the integral part of each
1889 co-ordinate, but unfortunately the RNDD semantics would saturate
1890 at 16 bits if we performed the operation directly to a 16-bit
1891 destination. Therefore, we round to 32-bit temporaries where
1892 appropriate, and then store only the lower 16 bits. */
1893 brw_RNDD( p
, retype( floors
[ 0 ], BRW_REGISTER_TYPE_D
), param
[ 0 ] );
1894 brw_RNDD( p
, retype( itmp
[ 0 ], BRW_REGISTER_TYPE_D
), param
[ 1 ] );
1895 brw_RNDD( p
, retype( floors
[ 1 ], BRW_REGISTER_TYPE_D
), param
[ 2 ] );
1896 brw_RNDD( p
, retype( itmp
[ 1 ], BRW_REGISTER_TYPE_D
), param
[ 3 ] );
1897 brw_MOV( p
, high_words( floors
[ 0 ] ), low_words( itmp
[ 0 ] ) );
1898 brw_MOV( p
, high_words( floors
[ 1 ] ), low_words( itmp
[ 1 ] ) );
1900 /* Modify the flag register here, because the side effect is useful
1901 later (see below). We know for certain that all flags will be
1902 cleared, since the FRC instruction cannot possibly generate
1903 negative results. Even for exceptional inputs (infinities, denormals,
1904 NaNs), the architecture guarantees that the L conditional is false. */
1905 brw_set_conditionalmod( p
, BRW_CONDITIONAL_L
);
1906 brw_FRC( p
, param
[ 0 ], param
[ 0 ] );
1907 brw_set_predicate_control( p
, BRW_PREDICATE_NONE
);
1908 for( i
= 1; i
< 4; i
++ )
1909 brw_FRC( p
, param
[ i
], param
[ i
] );
1911 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1913 for( i
= 0; i
< 4; i
++ )
1914 brw_MUL( p
, interp
[ i
], param
[ i
], brw_imm_f( 6.0 ) );
1915 for( i
= 0; i
< 4; i
++ )
1916 brw_ADD( p
, interp
[ i
], interp
[ i
], brw_imm_f( -15.0 ) );
1917 for( i
= 0; i
< 4; i
++ )
1918 brw_MUL( p
, interp
[ i
], interp
[ i
], param
[ i
] );
1919 for( i
= 0; i
< 4; i
++ )
1920 brw_ADD( p
, interp
[ i
], interp
[ i
], brw_imm_f( 10.0 ) );
1921 for( j
= 0; j
< 3; j
++ )
1922 for( i
= 0; i
< 4; i
++ )
1923 brw_MUL( p
, interp
[ i
], interp
[ i
], param
[ i
] );
1925 /* Mark the current address, as it will be a jump destination. The
1926 following code will be executed twice: first, with the flag
1927 register clear indicating the w=0 case, and second with flags
1931 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1932 be hashed. Since we have only 16 bits of precision in the hash, we
1933 must be careful about thorough mixing to maintain entropy as we
1934 squash the input vector into a small scalar. */
1935 brw_MUL( p
, brw_null_reg(), low_words( floors
[ 0 ] ),
1936 brw_imm_uw( 0xBC8F ) );
1937 brw_MAC( p
, brw_null_reg(), high_words( floors
[ 0 ] ),
1938 brw_imm_uw( 0xD0BD ) );
1939 brw_MAC( p
, brw_null_reg(), low_words( floors
[ 1 ] ),
1940 brw_imm_uw( 0x9B93 ) );
1941 brw_MAC( p
, low_words( itmp
[ 0 ] ), high_words( floors
[ 1 ] ),
1942 brw_imm_uw( 0xA359 ) );
1943 brw_ADD( p
, high_words( itmp
[ 0 ] ), low_words( itmp
[ 0 ] ),
1944 brw_imm_uw( 0xBC8F ) );
1946 /* Temporarily disable the execution mask while we work with ExecSize=16
1947 channels (the mask is set for ExecSize=8 and is probably incorrect).
1948 Although this might cause execution of unwanted channels, the code
1949 writes only to temporary registers and has no side effects, so
1950 disabling the mask is harmless. */
1951 brw_push_insn_state( p
);
1952 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1953 brw_ADD( p
, wtmp
[ 1 ], wtmp
[ 0 ], brw_imm_uw( 0xD0BD ) );
1954 brw_ADD( p
, wtmp
[ 2 ], wtmp
[ 0 ], brw_imm_uw( 0x9B93 ) );
1955 brw_ADD( p
, wtmp
[ 3 ], wtmp
[ 1 ], brw_imm_uw( 0x9B93 ) );
1957 /* We're now ready to perform the hashing. The eight hashes are
1958 interleaved for performance. The hash function used is
1959 designed to rapidly achieve avalanche and require only 16x16
1960 bit multiplication, and 8-bit swizzles (which we get for
1962 for( i
= 0; i
< 4; i
++ )
1963 brw_MUL( p
, wtmp
[ i
], wtmp
[ i
], brw_imm_uw( 0x28D9 ) );
1964 for( i
= 0; i
< 4; i
++ )
1965 brw_XOR( p
, even_bytes( wtmp
[ i
] ), even_bytes( wtmp
[ i
] ),
1966 odd_bytes( wtmp
[ i
] ) );
1967 for( i
= 0; i
< 4; i
++ )
1968 brw_MUL( p
, wtmp
[ i
], wtmp
[ i
], brw_imm_uw( 0xC6D5 ) );
1969 for( i
= 0; i
< 4; i
++ )
1970 brw_XOR( p
, even_bytes( wtmp
[ i
] ), even_bytes( wtmp
[ i
] ),
1971 odd_bytes( wtmp
[ i
] ) );
1972 brw_pop_insn_state( p
);
1974 /* Now we want to initialise the four rear gradients based on the
1975 hashes. Format conversion from signed integer to float leaves
1976 everything scaled too high by a factor of pow( 2, 15 ), but
1977 we correct for that right at the end. */
1979 brw_ADD( p
, t
, param
[ 0 ], brw_imm_f( -1.0 ) );
1980 brw_MOV( p
, x0y0
, low_words( tmp
[ 0 ] ) );
1981 brw_MOV( p
, x0y1
, low_words( tmp
[ 1 ] ) );
1982 brw_MOV( p
, x1y0
, high_words( tmp
[ 0 ] ) );
1983 brw_MOV( p
, x1y1
, high_words( tmp
[ 1 ] ) );
1985 brw_push_insn_state( p
);
1986 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
1987 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 4 ) );
1988 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 4 ) );
1989 brw_pop_insn_state( p
);
1991 brw_MUL( p
, x1y0
, x1y0
, t
);
1992 brw_MUL( p
, x1y1
, x1y1
, t
);
1993 brw_ADD( p
, t
, param
[ 1 ], brw_imm_f( -1.0 ) );
1994 brw_MUL( p
, x0y0
, x0y0
, param
[ 0 ] );
1995 brw_MUL( p
, x0y1
, x0y1
, param
[ 0 ] );
1998 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
1999 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
2000 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
2001 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
2003 brw_push_insn_state( p
);
2004 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
2005 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 4 ) );
2006 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 4 ) );
2007 brw_pop_insn_state( p
);
2009 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
2010 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
2011 /* prepare t for the w component (used below): w the first time through
2012 the loop; w - 1 the second time) */
2013 brw_set_predicate_control( p
, BRW_PREDICATE_NORMAL
);
2014 brw_ADD( p
, t
, param
[ 3 ], brw_imm_f( -1.0 ) );
2015 p
->current
->header
.predicate_inverse
= 1;
2016 brw_MOV( p
, t
, param
[ 3 ] );
2017 p
->current
->header
.predicate_inverse
= 0;
2018 brw_set_predicate_control( p
, BRW_PREDICATE_NONE
);
2019 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param
[ 1 ] );
2020 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param
[ 1 ] );
2022 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
2023 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
2024 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
2025 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
2028 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
2029 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
2030 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
2031 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
2033 brw_push_insn_state( p
);
2034 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
2035 brw_SHL( p
, wtmp
[ 0 ], wtmp
[ 0 ], brw_imm_uw( 4 ) );
2036 brw_SHL( p
, wtmp
[ 1 ], wtmp
[ 1 ], brw_imm_uw( 4 ) );
2037 brw_pop_insn_state( p
);
2039 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param
[ 2 ] );
2040 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], param
[ 2 ] );
2041 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param
[ 2 ] );
2042 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], param
[ 2 ] );
2044 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
2045 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
2046 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
2047 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
2050 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 0 ] ) );
2051 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 1 ] ) );
2052 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 0 ] ) );
2053 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 1 ] ) );
2055 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], t
);
2056 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
2057 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], t
);
2058 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
2059 brw_ADD( p
, t
, param
[ 0 ], brw_imm_f( -1.0 ) );
2061 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
2062 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
2063 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
2064 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
2066 /* Here we interpolate in the y dimension... */
2067 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) );
2068 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) );
2069 brw_MUL( p
, x0y1
, x0y1
, interp
[ 1 ] );
2070 brw_MUL( p
, x1y1
, x1y1
, interp
[ 1 ] );
2071 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
2072 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
2074 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2075 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
2076 brw_MUL( p
, x1y0
, x1y0
, interp
[ 0 ] );
2077 brw_ADD( p
, tmp
[ 0 ], x0y0
, x1y0
);
2079 /* Now do the same thing for the front four gradients... */
2081 brw_MOV( p
, x0y0
, low_words( tmp
[ 2 ] ) );
2082 brw_MOV( p
, x0y1
, low_words( tmp
[ 3 ] ) );
2083 brw_MOV( p
, x1y0
, high_words( tmp
[ 2 ] ) );
2084 brw_MOV( p
, x1y1
, high_words( tmp
[ 3 ] ) );
2086 brw_push_insn_state( p
);
2087 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
2088 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 4 ) );
2089 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 4 ) );
2090 brw_pop_insn_state( p
);
2092 brw_MUL( p
, x1y0
, x1y0
, t
);
2093 brw_MUL( p
, x1y1
, x1y1
, t
);
2094 brw_ADD( p
, t
, param
[ 1 ], brw_imm_f( -1.0 ) );
2095 brw_MUL( p
, x0y0
, x0y0
, param
[ 0 ] );
2096 brw_MUL( p
, x0y1
, x0y1
, param
[ 0 ] );
2099 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
2100 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
2101 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
2102 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
2104 brw_push_insn_state( p
);
2105 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
2106 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 4 ) );
2107 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 4 ) );
2108 brw_pop_insn_state( p
);
2110 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
2111 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
2112 brw_ADD( p
, t
, param
[ 2 ], brw_imm_f( -1.0 ) );
2113 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], param
[ 1 ] );
2114 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], param
[ 1 ] );
2116 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
2117 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
2118 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
2119 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
2122 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
2123 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
2124 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
2125 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
2127 brw_push_insn_state( p
);
2128 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
2129 brw_SHL( p
, wtmp
[ 2 ], wtmp
[ 2 ], brw_imm_uw( 4 ) );
2130 brw_SHL( p
, wtmp
[ 3 ], wtmp
[ 3 ], brw_imm_uw( 4 ) );
2131 brw_pop_insn_state( p
);
2133 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], t
);
2134 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
2135 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], t
);
2136 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
2137 /* prepare t for the w component (used below): w the first time through
2138 the loop; w - 1 the second time) */
2139 brw_set_predicate_control( p
, BRW_PREDICATE_NORMAL
);
2140 brw_ADD( p
, t
, param
[ 3 ], brw_imm_f( -1.0 ) );
2141 p
->current
->header
.predicate_inverse
= 1;
2142 brw_MOV( p
, t
, param
[ 3 ] );
2143 p
->current
->header
.predicate_inverse
= 0;
2144 brw_set_predicate_control( p
, BRW_PREDICATE_NONE
);
2146 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
2147 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
2148 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
2149 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
2152 brw_MOV( p
, tmp
[ 4 ], low_words( tmp
[ 2 ] ) );
2153 brw_MOV( p
, tmp
[ 5 ], low_words( tmp
[ 3 ] ) );
2154 brw_MOV( p
, tmp
[ 6 ], high_words( tmp
[ 2 ] ) );
2155 brw_MOV( p
, tmp
[ 7 ], high_words( tmp
[ 3 ] ) );
2157 brw_MUL( p
, tmp
[ 4 ], tmp
[ 4 ], t
);
2158 brw_MUL( p
, tmp
[ 5 ], tmp
[ 5 ], t
);
2159 brw_MUL( p
, tmp
[ 6 ], tmp
[ 6 ], t
);
2160 brw_MUL( p
, tmp
[ 7 ], tmp
[ 7 ], t
);
2162 brw_ADD( p
, x0y0
, x0y0
, tmp
[ 4 ] );
2163 brw_ADD( p
, x0y1
, x0y1
, tmp
[ 5 ] );
2164 brw_ADD( p
, x1y0
, x1y0
, tmp
[ 6 ] );
2165 brw_ADD( p
, x1y1
, x1y1
, tmp
[ 7 ] );
2167 /* Interpolate in the y dimension: */
2168 brw_ADD( p
, x0y1
, x0y1
, negate( x0y0
) );
2169 brw_ADD( p
, x1y1
, x1y1
, negate( x1y0
) );
2170 brw_MUL( p
, x0y1
, x0y1
, interp
[ 1 ] );
2171 brw_MUL( p
, x1y1
, x1y1
, interp
[ 1 ] );
2172 brw_ADD( p
, x0y0
, x0y0
, x0y1
);
2173 brw_ADD( p
, x1y0
, x1y0
, x1y1
);
2175 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2176 time put the front face in tmp[ 1 ] and we're nearly there... */
2177 brw_ADD( p
, x1y0
, x1y0
, negate( x0y0
) );
2178 brw_MUL( p
, x1y0
, x1y0
, interp
[ 0 ] );
2179 brw_ADD( p
, tmp
[ 1 ], x0y0
, x1y0
);
2181 /* Another interpolation, in the z dimension: */
2182 brw_ADD( p
, tmp
[ 1 ], tmp
[ 1 ], negate( tmp
[ 0 ] ) );
2183 brw_MUL( p
, tmp
[ 1 ], tmp
[ 1 ], interp
[ 2 ] );
2184 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], tmp
[ 1 ] );
2186 /* Exit the loop if we've computed both cubes... */
2187 origin
= p
->nr_insn
;
2188 brw_push_insn_state( p
);
2189 brw_set_predicate_control( p
, BRW_PREDICATE_NORMAL
);
2190 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
2191 brw_ADD( p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2192 brw_pop_insn_state( p
);
2194 /* Save the result for the w=0 case, and increment the w coordinate: */
2195 brw_MOV( p
, w0
, tmp
[ 0 ] );
2196 brw_ADD( p
, high_words( floors
[ 1 ] ), high_words( floors
[ 1 ] ),
2199 /* Loop around for the other cube. Explicitly set the flag register
2200 (unfortunately we must spend an extra instruction to do this: we
2201 can't rely on a side effect of the previous MOV or ADD because
2202 conditional modifiers which are normally true might be false in
2203 exceptional circumstances, e.g. given a NaN input; the add to
2204 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2205 brw_push_insn_state( p
);
2206 brw_set_mask_control( p
, BRW_MASK_DISABLE
);
2207 brw_MOV( p
, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2208 brw_ADD( p
, brw_ip_reg(), brw_ip_reg(),
2209 brw_imm_d( ( loop
- p
->nr_insn
) << 4 ) );
2210 brw_pop_insn_state( p
);
2212 /* Patch the previous conditional branch now that we know the
2213 destination address. */
2214 brw_set_src1( p
->store
+ origin
,
2215 brw_imm_d( ( p
->nr_insn
- origin
) << 4 ) );
2217 /* The very last interpolation. */
2218 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], negate( w0
) );
2219 brw_MUL( p
, tmp
[ 0 ], tmp
[ 0 ], interp
[ 3 ] );
2220 brw_ADD( p
, tmp
[ 0 ], tmp
[ 0 ], w0
);
2222 /* scale by pow( 2, -15 ), as described above */
2223 brw_MUL( p
, param
[ 0 ], tmp
[ 0 ], brw_imm_f( 0.000030517578125 ) );
2225 release_tmps( c
, mark
);
2228 static void emit_noise4( struct brw_wm_compile
*c
,
2229 const struct prog_instruction
*inst
)
2231 struct brw_compile
*p
= &c
->func
;
2232 struct brw_reg src0
, src1
, src2
, src3
, param0
, param1
, param2
, param3
, dst
;
2233 GLuint mask
= inst
->DstReg
.WriteMask
;
2235 int mark
= mark_tmps( c
);
2237 assert( mark
== 0 );
2239 src0
= get_src_reg( c
, inst
, 0, 0 );
2240 src1
= get_src_reg( c
, inst
, 0, 1 );
2241 src2
= get_src_reg( c
, inst
, 0, 2 );
2242 src3
= get_src_reg( c
, inst
, 0, 3 );
2244 param0
= alloc_tmp( c
);
2245 param1
= alloc_tmp( c
);
2246 param2
= alloc_tmp( c
);
2247 param3
= alloc_tmp( c
);
2249 brw_MOV( p
, param0
, src0
);
2250 brw_MOV( p
, param1
, src1
);
2251 brw_MOV( p
, param2
, src2
);
2252 brw_MOV( p
, param3
, src3
);
2254 invoke_subroutine( c
, SUB_NOISE4
, noise4_sub
);
2256 /* Fill in the result: */
2257 brw_set_saturate( p
, inst
->SaturateMode
== SATURATE_ZERO_ONE
);
2258 for (i
= 0 ; i
< 4; i
++) {
2259 if (mask
& (1<<i
)) {
2260 dst
= get_dst_reg(c
, inst
, i
);
2261 brw_MOV( p
, dst
, param0
);
2264 if( inst
->SaturateMode
== SATURATE_ZERO_ONE
)
2265 brw_set_saturate( p
, 0 );
2267 release_tmps( c
, mark
);
/*
 * emit_wpos_xy: per the comments in the body, computes the pixel offset
 * from the window's bottom-left corner into the destination X and Y
 * channels.
 *
 * Reads destination channels 0 and 1 and channels 0 and 1 of source
 * operand 0.  Each half is only emitted when the matching bit is set in
 * the instruction's write mask.
 *
 * NOTE(review): the heads of the two emitted instructions are not visible
 * in this excerpt; only their trailing operands are.  The formulas in the
 * comments suggest an add of the word-retyped source and an immediate --
 * confirm against the full source.
 */
2270 static void emit_wpos_xy(struct brw_wm_compile
*c
,
2271 const struct prog_instruction
*inst
)
2273 struct brw_compile
*p
= &c
->func
;
2274 GLuint mask
= inst
->DstReg
.WriteMask
;
2275 struct brw_reg src0
[2], dst
[2];
/* Look up the X/Y destination registers and the matching source channels. */
2277 dst
[0] = get_dst_reg(c
, inst
, 0);
2278 dst
[1] = get_dst_reg(c
, inst
, 1);
2280 src0
[0] = get_src_reg(c
, inst
, 0, 0);
2281 src0
[1] = get_src_reg(c
, inst
, 0, 1);
2283 /* Calculate the pixel offset from window bottom left into destination
2286 if (mask
& WRITEMASK_X
) {
2287 /* X' = X - origin_x */
2290 retype(src0
[0], BRW_REGISTER_TYPE_W
),
2291 brw_imm_d(0 - c
->key
.origin_x
));
/* Y half: the source is negated and the immediate operand is
 * origin_y + drawable_height - 1. */
2294 if (mask
& WRITEMASK_Y
) {
2295 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2298 negate(retype(src0
[1], BRW_REGISTER_TYPE_W
)),
2299 brw_imm_d(c
->key
.origin_y
+ c
->key
.drawable_height
- 1));
2304 /* BIAS on SIMD8 not working yet... */
/*
 * emit_txb: emit a biased texture sample for the given instruction.
 *
 * Loads the texture coordinates from source operand 0 into message
 * registers 2..4 according to the texture target, puts the bias (source
 * channel 3) in message register 5 and zero in message register 6, then
 * picks a SAMPLE_BIAS message type depending on BRW_IS_IGDNG().
 *
 * NOTE(review): the declarations of `i` and `msg_type`, the `break`
 * statements of the switch, and the head of the final sampler call are not
 * visible in this excerpt; only some of that call's arguments are.
 */
2306 static void emit_txb(struct brw_wm_compile
*c
,
2307 const struct prog_instruction
*inst
)
2309 struct brw_compile
*p
= &c
->func
;
2310 struct brw_reg dst
[4], src
[4], payload_reg
;
2311 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2312 const GLuint unit
= inst
->TexSrcUnit
;
2316 assert(unit
< BRW_MAX_TEX_UNIT
);
/* Register looked up for PROGRAM_PAYLOAD / PAYLOAD_DEPTH; passed below
 * as the message's src0. */
2318 payload_reg
= get_reg(c
, PROGRAM_PAYLOAD
, PAYLOAD_DEPTH
, 0, 1, 0, 0);
/* Fetch all four destination and source channel registers. */
2320 for (i
= 0; i
< 4; i
++)
2321 dst
[i
] = get_dst_reg(c
, inst
, i
);
2322 for (i
= 0; i
< 4; i
++)
2323 src
[i
] = get_src_reg(c
, inst
, 0, i
);
/* Coordinates not used by the texture target are filled with 0.0. */
2325 switch (inst
->TexSrcTarget
) {
2326 case TEXTURE_1D_INDEX
:
2327 brw_MOV(p
, brw_message_reg(2), src
[0]); /* s coord */
2328 brw_MOV(p
, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2329 brw_MOV(p
, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2331 case TEXTURE_2D_INDEX
:
2332 case TEXTURE_RECT_INDEX
:
2333 brw_MOV(p
, brw_message_reg(2), src
[0]);
2334 brw_MOV(p
, brw_message_reg(3), src
[1]);
2335 brw_MOV(p
, brw_message_reg(4), brw_imm_f(0));
2337 case TEXTURE_3D_INDEX
:
2338 case TEXTURE_CUBE_INDEX
:
2339 brw_MOV(p
, brw_message_reg(2), src
[0]);
2340 brw_MOV(p
, brw_message_reg(3), src
[1]);
2341 brw_MOV(p
, brw_message_reg(4), src
[2]);
2344 /* invalid target */
2347 brw_MOV(p
, brw_message_reg(5), src
[3]); /* bias */
2348 brw_MOV(p
, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
/* Message type selection: IGDNG uses the SIMD8 bias message; the other
 * visible assignment uses the SIMD16 bias message. */
2350 if (BRW_IS_IGDNG(p
->brw
)) {
2351 msg_type
= BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG
;
2353 /* Does it work well on SIMD8? */
2354 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS
;
2358 retype(vec8(dst
[0]), BRW_REGISTER_TYPE_UW
), /* dest */
2360 retype(payload_reg
, BRW_REGISTER_TYPE_UW
), /* src0 */
2361 SURF_INDEX_TEXTURE(unit
),
2363 inst
->DstReg
.WriteMask
, /* writemask */
2364 msg_type
, /* msg_type */
2365 4, /* response_length */
2369 BRW_SAMPLER_SIMD_MODE_SIMD8
);
/*
 * emit_tex: emit a texture sample for the given instruction.
 *
 * `shadow` is set when the bit for this texture unit is set in
 * c->key.shadowtex_mask.  It selects the message length (6 when set,
 * otherwise 4).  The texture target selects which coordinate channels are
 * emitted (`emit`), and the coordinates are moved into consecutive message
 * registers.
 *
 * NOTE(review): several statements are not visible in this excerpt: the
 * declarations of `i`, `nr`, `emit`, `msg_len` and `msg_type`, the 1D
 * case body, the conditions guarding the two MOVs inside the coordinate
 * loop, the MOVs to message registers 5/6, the message-type choice and the
 * final MOV of 1.0 to dst[3], and the head of the sampler call.  Presumably
 * the register 5/6 loads, the COMPARE message type and the final MOV apply
 * only to shadow sampling -- confirm against the full source.
 */
2373 static void emit_tex(struct brw_wm_compile
*c
,
2374 const struct prog_instruction
*inst
)
2376 struct brw_compile
*p
= &c
->func
;
2377 struct brw_reg dst
[4], src
[4], payload_reg
;
2378 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2379 const GLuint unit
= inst
->TexSrcUnit
;
2383 GLboolean shadow
= (c
->key
.shadowtex_mask
& (1<<unit
)) ? 1 : 0;
2386 assert(unit
< BRW_MAX_TEX_UNIT
);
/* Register looked up for PROGRAM_PAYLOAD / PAYLOAD_DEPTH; passed below
 * as the message's src0. */
2388 payload_reg
= get_reg(c
, PROGRAM_PAYLOAD
, PAYLOAD_DEPTH
, 0, 1, 0, 0);
/* Fetch all four destination and source channel registers. */
2390 for (i
= 0; i
< 4; i
++)
2391 dst
[i
] = get_dst_reg(c
, inst
, i
);
2392 for (i
= 0; i
< 4; i
++)
2393 src
[i
] = get_src_reg(c
, inst
, 0, i
);
/* Choose which coordinate channels are meaningful for the target. */
2395 switch (inst
->TexSrcTarget
) {
2396 case TEXTURE_1D_INDEX
:
2400 case TEXTURE_2D_INDEX
:
2401 case TEXTURE_RECT_INDEX
:
2402 emit
= WRITEMASK_XY
;
2405 case TEXTURE_3D_INDEX
:
2406 case TEXTURE_CUBE_INDEX
:
2407 emit
= WRITEMASK_XYZ
;
2411 /* invalid target */
2416 /* move/load S, T, R coords */
2417 for (i
= 0; i
< nr
; i
++) {
2418 static const GLuint swz
[4] = {0,1,2,2};
2420 brw_MOV(p
, brw_message_reg(msg_len
+1), src
[swz
[i
]]);
2422 brw_MOV(p
, brw_message_reg(msg_len
+1), brw_imm_f(0));
2427 brw_MOV(p
, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2428 brw_MOV(p
, brw_message_reg(6), src
[2]); /* ref value / R coord */
/* Message type selection: IGDNG has separate COMPARE and plain SIMD8
 * sample messages; the remaining assignment uses the generic SIMD8
 * sample. */
2431 if (BRW_IS_IGDNG(p
->brw
)) {
2433 msg_type
= BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG
;
2435 msg_type
= BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG
;
2437 /* Does it work for shadow on SIMD8 ? */
2438 msg_type
= BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE
;
2442 retype(vec8(dst
[0]), BRW_REGISTER_TYPE_UW
), /* dest */
2444 retype(payload_reg
, BRW_REGISTER_TYPE_UW
), /* src0 */
2445 SURF_INDEX_TEXTURE(unit
),
2447 inst
->DstReg
.WriteMask
, /* writemask */
2448 msg_type
, /* msg_type */
2449 4, /* response_length */
2450 shadow
? 6 : 4, /* msg_length */
2453 BRW_SAMPLER_SIMD_MODE_SIMD8
);
2456 brw_MOV(p
, dst
[3], brw_imm_f(1.0));
2461 * Resolve subroutine calls after code emit is done.
2463 static void post_wm_emit( struct brw_wm_compile
*c
)
2465 brw_resolve_cals(&c
->func
);
/*
 * get_argument_regs: fill regs[0..3] with the source registers of one
 * instruction operand.
 *
 * For each channel i in 0..3 whose bit is set in `mask`, regs[i] is set to
 * get_src_reg(c, inst, index, i).  Entries for channels not in `mask` are
 * left untouched.
 *
 * NOTE(review): the return type and the declarations of `index`, `mask`
 * and `i` are not visible in this excerpt.
 */
2469 get_argument_regs(struct brw_wm_compile
*c
,
2470 const struct prog_instruction
*inst
,
2472 struct brw_reg
*regs
,
2477 for (i
= 0; i
< 4; i
++) {
2478 if (mask
& (1 << i
))
2479 regs
[i
] = get_src_reg(c
, inst
, index
, i
);
/*
 * brw_wm_emit_glsl: walk the fragment program's instructions
 * (c->prog_instructions[0 .. c->nr_fp_insns-1]) and emit native code for
 * each one into c->func.
 *
 * For every instruction it gathers the destination and argument registers,
 * sets the conditional modifier from inst->CondUpdate, and dispatches on
 * the opcode to the matching emit helper.  IF and loop nesting is tracked
 * in the if_inst[] / loop_inst[] stacks.  When INTEL_DEBUG has DEBUG_WM
 * set, the generated instructions are disassembled to stderr at the end.
 *
 * NOTE(review): most `case` labels, `break` statements and several local
 * declarations (e.g. `j`, `dst_flags`, `br`) are not visible in this
 * excerpt, so which opcode maps to which helper call cannot be confirmed
 * from here.
 */
2483 static void brw_wm_emit_glsl(struct brw_context
*brw
, struct brw_wm_compile
*c
)
2485 #define MAX_IF_DEPTH 32
2486 #define MAX_LOOP_DEPTH 32
2487 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
];
2488 GLuint i
, if_depth
= 0, loop_depth
= 0;
2489 struct brw_compile
*p
= &c
->func
;
2490 struct brw_indirect stack_index
= brw_indirect(0, 0);
/* Reset the flag that alloc_grf() uses to warn only once per compile. */
2492 c
->out_of_regs
= GL_FALSE
;
/* Point the address register used as the call stack pointer at c->stack. */
2495 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
2496 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
/* Main translation loop: one iteration per program instruction. */
2498 for (i
= 0; i
< c
->nr_fp_insns
; i
++) {
2499 const struct prog_instruction
*inst
= &c
->prog_instructions
[i
];
2501 struct brw_reg args
[3][4], dst
[4];
2507 _mesa_printf("Inst %d: ", i
);
2508 _mesa_print_instruction(inst
);
2511 /* fetch any constants that this instruction needs */
2512 if (c
->fp
->use_const_buffer
)
2513 fetch_constants(c
, inst
);
/* Destination/argument registers are gathered for everything but ARL;
 * channels outside the write mask get the null register. */
2515 if (inst
->Opcode
!= OPCODE_ARL
) {
2516 for (j
= 0; j
< 4; j
++) {
2517 if (inst
->DstReg
.WriteMask
& (1 << j
))
2518 dst
[j
] = get_dst_reg(c
, inst
, j
);
2520 dst
[j
] = brw_null_reg();
2523 for (j
= 0; j
< brw_wm_nr_args(inst
->Opcode
); j
++)
2524 get_argument_regs(c
, inst
, j
, args
[j
], WRITEMASK_XYZW
);
/* dst_flags = write mask, plus SATURATE when the instruction saturates. */
2526 dst_flags
= inst
->DstReg
.WriteMask
;
2527 if (inst
->SaturateMode
== SATURATE_ZERO_ONE
)
2528 dst_flags
|= SATURATE
;
2530 if (inst
->CondUpdate
)
2531 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NZ
);
2533 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
/* NOTE(review): this recomputes dst_flags exactly as above; it looks
 * redundant -- confirm against the full source. */
2535 dst_flags
= inst
->DstReg
.WriteMask
;
2536 if (inst
->SaturateMode
== SATURATE_ZERO_ONE
)
2537 dst_flags
|= SATURATE
;
/* Opcode dispatch. */
2539 switch (inst
->Opcode
) {
2541 emit_pixel_xy(c
, inst
);
2544 emit_delta_xy(c
, inst
);
2547 emit_pixel_w(c
, inst
);
2550 emit_linterp(c
, inst
);
2553 emit_pinterp(c
, inst
);
2556 emit_cinterp(c
, inst
);
2559 emit_wpos_xy(c
, inst
);
2562 emit_fb_write(c
, inst
);
2564 case WM_FRONTFACING
:
2565 emit_frontfacing(c
, inst
);
2568 emit_alu2(p
, brw_ADD
, dst
, dst_flags
, args
[0], args
[1]);
2574 emit_alu1(p
, brw_FRC
, dst
, dst_flags
, args
[0]);
2577 emit_alu1(p
, brw_RNDD
, dst
, dst_flags
, args
[0]);
2580 unalias3(c
, emit_lrp
,
2581 dst
, dst_flags
, args
[0], args
[1], args
[2]);
2584 emit_alu1(p
, brw_RNDZ
, dst
, dst_flags
, args
[0]);
2588 emit_alu1(p
, brw_MOV
, dst
, dst_flags
, args
[0]);
2591 emit_dp3(p
, dst
, dst_flags
, args
[0], args
[1]);
2594 emit_dp4(p
, dst
, dst_flags
, args
[0], args
[1]);
2600 emit_dph(p
, dst
, dst_flags
, args
[0], args
[1]);
2622 emit_min_max(c
, inst
);
2626 emit_ddxy(p
, dst
, dst_flags
, (inst
->Opcode
== OPCODE_DDX
),
2630 emit_sop(p
, dst
, dst_flags
,
2631 BRW_CONDITIONAL_L
, args
[0], args
[1]);
2634 emit_sop(p
, dst
, dst_flags
,
2635 BRW_CONDITIONAL_LE
, args
[0], args
[1]);
2638 emit_sop(p
, dst
, dst_flags
,
2639 BRW_CONDITIONAL_G
, args
[0], args
[1]);
2642 emit_sop(p
, dst
, dst_flags
,
2643 BRW_CONDITIONAL_GE
, args
[0], args
[1]);
2646 emit_sop(p
, dst
, dst_flags
,
2647 BRW_CONDITIONAL_EQ
, args
[0], args
[1]);
2650 emit_sop(p
, dst
, dst_flags
,
2651 BRW_CONDITIONAL_NEQ
, args
[0], args
[1]);
2654 emit_alu2(p
, brw_MUL
, dst
, dst_flags
, args
[0], args
[1]);
2660 emit_mad(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
2663 emit_noise1(c
, inst
);
2666 emit_noise2(c
, inst
);
2669 emit_noise3(c
, inst
);
2672 emit_noise4(c
, inst
);
/* IF / ELSE / ENDIF bookkeeping on the if_inst[] stack. */
2684 assert(if_depth
< MAX_IF_DEPTH
);
2685 if_inst
[if_depth
++] = brw_IF(p
, BRW_EXECUTE_8
);
2688 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
2691 assert(if_depth
> 0);
2692 brw_ENDIF(p
, if_inst
[--if_depth
]);
2695 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
/* Subroutine call sequence: store IP + 3*16 in the slot addressed by
 * stack_index, advance the stack address by 4, record the call site with
 * brw_save_call(), then emit an IP-relative add whose offset is a
 * placeholder (presumably patched later by brw_resolve_cals()). */
2701 brw_push_insn_state(p
);
2702 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
2703 brw_set_access_mode(p
, BRW_ALIGN_1
);
2704 brw_ADD(p
, deref_1ud(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
2705 brw_set_access_mode(p
, BRW_ALIGN_16
);
2706 brw_ADD(p
, get_addr_reg(stack_index
),
2707 get_addr_reg(stack_index
), brw_imm_d(4));
2708 brw_save_call(&c
->func
, inst
->Comment
, p
->nr_insn
);
2709 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2710 brw_pop_insn_state(p
);
/* Subroutine return sequence: step the stack address back by 4 and load
 * IP from the slot it now addresses. */
2714 brw_push_insn_state(p
);
2715 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
2716 brw_ADD(p
, get_addr_reg(stack_index
),
2717 get_addr_reg(stack_index
), brw_imm_d(-4));
2718 brw_set_access_mode(p
, BRW_ALIGN_1
);
2719 brw_MOV(p
, brw_ip_reg(), deref_1ud(stack_index
, 0));
2720 brw_set_access_mode(p
, BRW_ALIGN_16
);
2721 brw_pop_insn_state(p
);
/* NOTE(review): no depth check against MAX_LOOP_DEPTH is visible before
 * this push, unlike the IF case above -- confirm against the full source. */
2724 case OPCODE_BGNLOOP
:
2725 /* XXX may need to invalidate the current_constant regs */
2726 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
2730 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2734 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2736 case OPCODE_ENDLOOP
:
2738 struct brw_instruction
*inst0
, *inst1
;
2741 if (BRW_IS_IGDNG(brw
))
2745 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
2746 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2747 while (inst0
> loop_inst
[loop_depth
]) {
2749 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
) {
2750 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
2751 inst0
->bits3
.if_else
.pop_count
= 0;
2753 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
) {
2754 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
2755 inst0
->bits3
.if_else
.pop_count
= 0;
2761 _mesa_printf("unsupported IR in fragment shader %d\n",
/* Predication follows inst->CondUpdate. */
2765 if (inst
->CondUpdate
)
2766 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
2768 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* Optional debug dump of the generated native code. */
2772 if (INTEL_DEBUG
& DEBUG_WM
) {
2773 _mesa_printf("wm-native:\n");
2774 for (i
= 0; i
< p
->nr_insn
; i
++)
2775 brw_disasm(stderr
, &p
->store
[i
]);
2781 * Do GPU code generation for shaders that use GLSL features such as
2782 * flow control. Other shaders will be compiled with the
/*
 * brw_wm_glsl_emit: entry point for code generation of a fragment program
 * through the GLSL path.
 *
 * Runs the code generator (brw_wm_emit_glsl), optionally prints debug
 * output when INTEL_DEBUG has DEBUG_WM set, and finally records the number
 * of GRF registers used and a scratch size of zero in c->prog_data.
 *
 * NOTE(review): the statement that follows the "initial instruction
 * translation/simplification" comment is not visible in this excerpt.
 */
2784 void brw_wm_glsl_emit(struct brw_context
*brw
, struct brw_wm_compile
*c
)
2786 if (INTEL_DEBUG
& DEBUG_WM
) {
2787 _mesa_printf("brw_wm_glsl_emit:\n");
2790 /* initial instruction translation/simplification */
2793 /* actual code generation */
2794 brw_wm_emit_glsl(brw
, c
);
2796 if (INTEL_DEBUG
& DEBUG_WM
) {
2797 brw_wm_print_program(c
, "brw_wm_glsl_emit done");
/* Publish the register and scratch requirements of the generated program. */
2800 c
->prog_data
.total_grf
= num_grf_used(c
);
2801 c
->prog_data
.total_scratch
= 0;