4 TGSI, Tungsten Graphics Shader Instructions, is an intermediate language
5 for describing shaders. Since Gallium is inherently shaderful, shaders are
6 an important part of the API. TGSI is the only intermediate representation
10 TGSI Instruction Specification
11 ==============================
14 1 Instruction Set Operations
15 =============================
18 1.1 GL_NV_vertex_program
19 -------------------------
22 1.1.1 ARL - Address Register Load
38 1.1.3 LIT - Light Coefficients
41 dst.y = max(src.x, 0.0)
42 dst.z = (src.x > 0.0) ? pow(max(src.y, 0.0), clamp(src.w, -128.0, 128.0)) : 0.0
46 1.1.4 RCP - Reciprocal
54 1.1.5 RSQ - Reciprocal Square Root
56 dst.x = 1.0 / sqrt(abs(src.x))
57 dst.y = 1.0 / sqrt(abs(src.x))
58 dst.z = 1.0 / sqrt(abs(src.x))
59 dst.w = 1.0 / sqrt(abs(src.x))
62 1.1.6 EXP - Approximate Exponential Base 2
64 dst.x = pow(2.0, floor(src.x))
65 dst.y = src.x - floor(src.x)
66 dst.z = pow(2.0, src.x)
70 1.1.7 LOG - Approximate Logarithm Base 2
72 dst.x = floor(lg2(abs(src.x)))
73 dst.y = abs(src.x) / pow(2.0, floor(lg2(abs(src.x))))
74 dst.z = lg2(abs(src.x))
80 dst.x = src0.x * src1.x
81 dst.y = src0.y * src1.y
82 dst.z = src0.z * src1.z
83 dst.w = src0.w * src1.w
88 dst.x = src0.x + src1.x
89 dst.y = src0.y + src1.y
90 dst.z = src0.z + src1.z
91 dst.w = src0.w + src1.w
94 1.1.10 DP3 - 3-component Dot Product
96 dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
97 dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
98 dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
99 dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
102 1.1.11 DP4 - 4-component Dot Product
104 dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
105 dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
106 dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
107 dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
110 1.1.12 DST - Distance Vector
113 dst.y = src0.y * src1.y
120 dst.x = min(src0.x, src1.x)
121 dst.y = min(src0.y, src1.y)
122 dst.z = min(src0.z, src1.z)
123 dst.w = min(src0.w, src1.w)
128 dst.x = max(src0.x, src1.x)
129 dst.y = max(src0.y, src1.y)
130 dst.z = max(src0.z, src1.z)
131 dst.w = max(src0.w, src1.w)
134 1.1.15 SLT - Set On Less Than
136 dst.x = (src0.x < src1.x) ? 1.0 : 0.0
137 dst.y = (src0.y < src1.y) ? 1.0 : 0.0
138 dst.z = (src0.z < src1.z) ? 1.0 : 0.0
139 dst.w = (src0.w < src1.w) ? 1.0 : 0.0
142 1.1.16 SGE - Set On Greater Than Or Equal
144 dst.x = (src0.x >= src1.x) ? 1.0 : 0.0
145 dst.y = (src0.y >= src1.y) ? 1.0 : 0.0
146 dst.z = (src0.z >= src1.z) ? 1.0 : 0.0
147 dst.w = (src0.w >= src1.w) ? 1.0 : 0.0
150 1.1.17 MAD - Multiply And Add
152 dst.x = src0.x * src1.x + src2.x
153 dst.y = src0.y * src1.y + src2.y
154 dst.z = src0.z * src1.z + src2.z
155 dst.w = src0.w * src1.w + src2.w
160 dst.x = src0.x - src1.x
161 dst.y = src0.y - src1.y
162 dst.z = src0.z - src1.z
163 dst.w = src0.w - src1.w
166 1.2.4 LRP - Linear Interpolate
168 dst.x = src0.x * (src1.x - src2.x) + src2.x
169 dst.y = src0.y * (src1.y - src2.y) + src2.y
170 dst.z = src0.z * (src1.z - src2.z) + src2.z
171 dst.w = src0.w * (src1.w - src2.w) + src2.w
174 1.2.5 CND - Condition
176 dst.x = (src2.x > 0.5) ? src0.x : src1.x
177 dst.y = (src2.y > 0.5) ? src0.y : src1.y
178 dst.z = (src2.z > 0.5) ? src0.z : src1.z
179 dst.w = (src2.w > 0.5) ? src0.w : src1.w
182 1.2.7 DP2A - 2-component Dot Product And Add
184 dst.x = src0.x * src1.x + src0.y * src1.y + src2.x
185 dst.y = src0.x * src1.x + src0.y * src1.y + src2.x
186 dst.z = src0.x * src1.x + src0.y * src1.y + src2.x
187 dst.w = src0.x * src1.x + src0.y * src1.y + src2.x
190 1.3.4 FRAC - Fraction
192 dst.x = src.x - floor(src.x)
193 dst.y = src.y - floor(src.y)
194 dst.z = src.z - floor(src.z)
195 dst.w = src.w - floor(src.w)
200 dst.x = clamp(src0.x, src1.x, src2.x)
201 dst.y = clamp(src0.y, src1.y, src2.y)
202 dst.z = clamp(src0.z, src1.z, src2.z)
203 dst.w = clamp(src0.w, src1.w, src2.w)
222 1.3.10 EX2 - Exponential Base 2
224 dst.x = pow(2.0, src.x)
225 dst.y = pow(2.0, src.x)
226 dst.z = pow(2.0, src.x)
227 dst.w = pow(2.0, src.x)
230 1.3.11 LG2 - Logarithm Base 2
240 dst.x = pow(src0.x, src1.x)
241 dst.y = pow(src0.x, src1.x)
242 dst.z = pow(src0.x, src1.x)
243 dst.w = pow(src0.x, src1.x)
245 1.3.15 XPD - Cross Product
247 dst.x = src0.y * src1.z - src1.y * src0.z
248 dst.y = src0.z * src1.x - src1.z * src0.x
249 dst.z = src0.x * src1.y - src1.x * src0.y
261 1.4.2 RCC - Reciprocal Clamped
263 dst.x = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
264 dst.y = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
265 dst.z = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
266 dst.w = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
269 1.4.3 DPH - Homogeneous Dot Product
271 dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
272 dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
273 dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
274 dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
285 1.5.2 DDX - Derivative Relative To X
287 dst.x = partialx(src.x)
288 dst.y = partialx(src.y)
289 dst.z = partialx(src.z)
290 dst.w = partialx(src.w)
293 1.5.3 DDY - Derivative Relative To Y
295 dst.x = partialy(src.x)
296 dst.y = partialy(src.y)
297 dst.z = partialy(src.z)
298 dst.w = partialy(src.w)
301 1.5.7 KILP - Predicated Discard
306 1.5.10 PK2H - Pack Two 16-bit Floats
311 1.5.11 PK2US - Pack Two Unsigned 16-bit Scalars
316 1.5.12 PK4B - Pack Four Signed 8-bit Scalars
321 1.5.13 PK4UB - Pack Four Unsigned 8-bit Scalars
326 1.5.15 RFL - Reflection Vector
328 dst.x = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.x - src1.x
329 dst.y = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.y - src1.y
330 dst.z = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.z - src1.z
333 Considered for removal.
336 1.5.16 SEQ - Set On Equal
338 dst.x = (src0.x == src1.x) ? 1.0 : 0.0
339 dst.y = (src0.y == src1.y) ? 1.0 : 0.0
340 dst.z = (src0.z == src1.z) ? 1.0 : 0.0
341 dst.w = (src0.w == src1.w) ? 1.0 : 0.0
344 1.5.17 SFL - Set On False
351 Considered for removal.
353 1.5.18 SGT - Set On Greater Than
355 dst.x = (src0.x > src1.x) ? 1.0 : 0.0
356 dst.y = (src0.y > src1.y) ? 1.0 : 0.0
357 dst.z = (src0.z > src1.z) ? 1.0 : 0.0
358 dst.w = (src0.w > src1.w) ? 1.0 : 0.0
369 1.5.20 SLE - Set On Less Than Or Equal
371 dst.x = (src0.x <= src1.x) ? 1.0 : 0.0
372 dst.y = (src0.y <= src1.y) ? 1.0 : 0.0
373 dst.z = (src0.z <= src1.z) ? 1.0 : 0.0
374 dst.w = (src0.w <= src1.w) ? 1.0 : 0.0
377 1.5.21 SNE - Set On Not Equal
379 dst.x = (src0.x != src1.x) ? 1.0 : 0.0
380 dst.y = (src0.y != src1.y) ? 1.0 : 0.0
381 dst.z = (src0.z != src1.z) ? 1.0 : 0.0
382 dst.w = (src0.w != src1.w) ? 1.0 : 0.0
385 1.5.22 STR - Set On True
393 1.5.23 TEX - Texture Lookup
398 1.5.24 TXD - Texture Lookup with Derivatives
403 1.5.25 TXP - Projective Texture Lookup
408 1.5.26 UP2H - Unpack Two 16-bit Floats
412 Considered for removal.
414 1.5.27 UP2US - Unpack Two Unsigned 16-bit Scalars
418 Considered for removal.
420 1.5.28 UP4B - Unpack Four Signed 8-bit Values
424 Considered for removal.
426 1.5.29 UP4UB - Unpack Four Unsigned 8-bit Scalars
430 Considered for removal.
432 1.5.30 X2D - 2D Coordinate Transformation
434 dst.x = src0.x + src1.x * src2.x + src1.y * src2.y
435 dst.y = src0.y + src1.x * src2.z + src1.y * src2.w
436 dst.z = src0.x + src1.x * src2.x + src1.y * src2.y
437 dst.w = src0.y + src1.x * src2.z + src1.y * src2.w
439 Considered for removal.
442 1.6 GL_NV_vertex_program2
443 --------------------------
446 1.6.1 ARA - Address Register Add
450 Considered for removal.
452 1.6.2 ARR - Address Register Load With Round
464 Considered for removal.
466 1.6.4 CAL - Subroutine Call
472 1.6.5 RET - Subroutine Call Return
476 Potential restrictions:
477 * Only occurs at end of function.
481 dst.x = (src.x > 0.0) ? 1.0 : (src.x < 0.0) ? -1.0 : 0.0
482 dst.y = (src.y > 0.0) ? 1.0 : (src.y < 0.0) ? -1.0 : 0.0
483 dst.z = (src.z > 0.0) ? 1.0 : (src.z < 0.0) ? -1.0 : 0.0
484 dst.w = (src.w > 0.0) ? 1.0 : (src.w < 0.0) ? -1.0 : 0.0
489 dst.x = (src0.x < 0.0) ? src1.x : src2.x
490 dst.y = (src0.y < 0.0) ? src1.y : src2.y
491 dst.z = (src0.z < 0.0) ? src1.z : src2.z
492 dst.w = (src0.w < 0.0) ? src1.w : src2.w
495 1.8.2 KIL - Conditional Discard
497 if (src.x < 0.0 || src.y < 0.0 || src.z < 0.0 || src.w < 0.0)
502 1.8.3 SCS - Sine Cosine
510 1.8.4 TXB - Texture Lookup With Bias
515 1.9.1 NRM - 3-component Vector Normalise
517 dst.x = src.x / sqrt(src.x * src.x + src.y * src.y + src.z * src.z)
518 dst.y = src.y / sqrt(src.x * src.x + src.y * src.y + src.z * src.z)
519 dst.z = src.z / sqrt(src.x * src.x + src.y * src.y + src.z * src.z)
525 dst.x = src0.x / src1.x
526 dst.y = src0.y / src1.y
527 dst.z = src0.z / src1.z
528 dst.w = src0.w / src1.w
531 1.9.3 DP2 - 2-component Dot Product
533 dst.x = src0.x * src1.x + src0.y * src1.y
534 dst.y = src0.x * src1.x + src0.y * src1.y
535 dst.z = src0.x * src1.x + src0.y * src1.y
536 dst.w = src0.x * src1.x + src0.y * src1.y
539 1.9.5 TXL - Texture Lookup With LOD
554 1.9.8 BGNFOR - Begin a For-Loop
561 pc = [matching ENDFOR] + 1
564 Note: The destination must be a loop register.
565 The source must be a constant register.
567 Considered for cleanup / removal.
580 1.9.11 ENDIF - End If
585 1.9.12 ENDFOR - End a For-Loop
587 dst.x = dst.x + dst.z
591 pc = [matching BGNFOR instruction] + 1
594 Note: The destination must be a loop register.
596 Considered for cleanup / removal.
598 1.9.13 ENDREP - End Repeat
603 1.10.1 PUSHA - Push Address Register On Stack
610 Considered for cleanup / removal.
612 1.10.2 POPA - Pop Address Register From Stack
619 Considered for cleanup / removal.
622 1.11 GL_NV_gpu_program4
623 ------------------------
625 Support for these opcodes is indicated by a special pipe capability bit (TBD).
627 1.11.1 CEIL - Ceiling
635 1.11.2 I2F - Integer To Float
637 dst.x = (float) src.x
638 dst.y = (float) src.y
639 dst.z = (float) src.z
640 dst.w = (float) src.w
643 1.11.3 NOT - Bitwise Not
651 1.11.4 TRUNC - Truncate
659 1.11.5 SHL - Shift Left
661 dst.x = src0.x << src1.x
662 dst.y = src0.y << src1.x
663 dst.z = src0.z << src1.x
664 dst.w = src0.w << src1.x
667 1.11.6 SHR - Shift Right
669 dst.x = src0.x >> src1.x
670 dst.y = src0.y >> src1.x
671 dst.z = src0.z >> src1.x
672 dst.w = src0.w >> src1.x
675 1.11.7 AND - Bitwise And
677 dst.x = src0.x & src1.x
678 dst.y = src0.y & src1.y
679 dst.z = src0.z & src1.z
680 dst.w = src0.w & src1.w
683 1.11.8 OR - Bitwise Or
685 dst.x = src0.x | src1.x
686 dst.y = src0.y | src1.y
687 dst.z = src0.z | src1.z
688 dst.w = src0.w | src1.w
693 dst.x = src0.x % src1.x
694 dst.y = src0.y % src1.y
695 dst.z = src0.z % src1.z
696 dst.w = src0.w % src1.w
699 1.11.10 XOR - Bitwise Xor
701 dst.x = src0.x ^ src1.x
702 dst.y = src0.y ^ src1.y
703 dst.z = src0.z ^ src1.z
704 dst.w = src0.w ^ src1.w
707 1.11.11 SAD - Sum Of Absolute Differences
709 dst.x = abs(src0.x - src1.x) + src2.x
710 dst.y = abs(src0.y - src1.y) + src2.y
711 dst.z = abs(src0.z - src1.z) + src2.z
712 dst.w = abs(src0.w - src1.w) + src2.w
715 1.11.12 TXF - Texel Fetch
720 1.11.13 TXQ - Texture Size Query
725 1.11.14 CONT - Continue
730 1.12 GL_NV_geometry_program4
731 -----------------------------
739 1.12.2 ENDPRIM - End Primitive
748 1.13.1 BGNLOOP - Begin a Loop
753 1.13.2 BGNSUB - Begin Subroutine
758 1.13.3 ENDLOOP - End a Loop
763 1.13.4 ENDSUB - End Subroutine
769 1.13.10 NOP - No Operation
775 1.16.7 NRM4 - 4-component Vector Normalise
777 dst.x = src.x / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
778 dst.y = src.y / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
779 dst.z = src.z / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
780 dst.w = src.w / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
787 1.17.2 CALLNZ - Subroutine Call If Not Zero
797 1.17.5 BREAKC - Break Conditional
802 2 Explanation of symbols used
803 ==============================
810 abs(x) Absolute value of x.
814 ceil(x) Ceiling of x.
816 clamp(x,y,z) Clamp x between y and z.
817 (x < y) ? y : (x > z) ? z : x
823 lg2(x) Logarithm base 2 of x.
825 max(x,y) Maximum of x and y.
828 min(x,y) Minimum of x and y.
831 partialx(x) Derivative of x relative to fragment's X.
833 partialy(x) Derivative of x relative to fragment's Y.
835 pop() Pop from stack.
837 pow(x,y) Raise x to power of y.
839 push(x) Push x on stack.
845 sqrt(x) Square root of x.
854 discard Discard fragment.
856 dst First destination register.
858 dst0 First destination register.
862 src First source register.
864 src0 First source register.
866 src1 Second source register.
868 src2 Third source register.
870 target Label of target instruction.
877 3.1 Declaration Semantic
878 -------------------------
881 Follows Declaration token if Semantic bit is set.
883 Since its purpose is to link a shader with other stages of the pipeline,
884 it may only follow Declaration tokens that declare a register in either
885 the INPUT or the OUTPUT file.
887 SemanticName field contains the semantic name of the register being declared.
888 There is no default value.
890 SemanticIndex is an optional subscript that can be used to distinguish
891 different register declarations with the same semantic name. The default value
894 The meanings of the individual semantic names are explained in the following
900 Valid only in a fragment shader INPUT declaration.
902 FACE.x is negative when the primitive is back facing. FACE.x is positive
903 when the primitive is front facing.