4 TGSI, Tungsten Graphics Shader Infrastructure, is an intermediate language
5 for describing shaders. Since Gallium is inherently shaderful, shaders are
6 an important part of the API. TGSI is the only intermediate representation
9 From GL_NV_vertex_program
10 -------------------------
13 ARL - Address Register Load
39 LIT - Light Coefficients
45 dst.y = max(src.x, 0.0)
47 dst.z = (src.x > 0.0) ? pow(max(src.y, 0.0), clamp(src.w, -128.0, 128.0)) : 0.0
65 RSQ - Reciprocal Square Root
69 dst.x = 1.0 / sqrt(abs(src.x))
71 dst.y = 1.0 / sqrt(abs(src.x))
73 dst.z = 1.0 / sqrt(abs(src.x))
75 dst.w = 1.0 / sqrt(abs(src.x))
78 EXP - Approximate Exponential Base 2
82 dst.x = pow(2.0, floor(src.x))
84 dst.y = src.x - floor(src.x)
86 dst.z = pow(2.0, src.x)
91 LOG - Approximate Logarithm Base 2
95 dst.x = floor(lg2(abs(src.x)))
97 dst.y = abs(src.x) / pow(2.0, floor(lg2(abs(src.x))))
99 dst.z = lg2(abs(src.x))
108 dst.x = src0.x * src1.x
110 dst.y = src0.y * src1.y
112 dst.z = src0.z * src1.z
114 dst.w = src0.w * src1.w
121 dst.x = src0.x + src1.x
123 dst.y = src0.y + src1.y
125 dst.z = src0.z + src1.z
127 dst.w = src0.w + src1.w
130 DP3 - 3-component Dot Product
134 dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
136 dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
138 dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
140 dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
143 DP4 - 4-component Dot Product
147 dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
149 dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
151 dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
153 dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
156 DST - Distance Vector
162 dst.y = src0.y * src1.y
173 dst.x = min(src0.x, src1.x)
175 dst.y = min(src0.y, src1.y)
177 dst.z = min(src0.z, src1.z)
179 dst.w = min(src0.w, src1.w)
186 dst.x = max(src0.x, src1.x)
188 dst.y = max(src0.y, src1.y)
190 dst.z = max(src0.z, src1.z)
192 dst.w = max(src0.w, src1.w)
195 SLT - Set On Less Than
199 dst.x = (src0.x < src1.x) ? 1.0 : 0.0
201 dst.y = (src0.y < src1.y) ? 1.0 : 0.0
203 dst.z = (src0.z < src1.z) ? 1.0 : 0.0
205 dst.w = (src0.w < src1.w) ? 1.0 : 0.0
208 SGE - Set On Greater Than Or Equal
212 dst.x = (src0.x >= src1.x) ? 1.0 : 0.0
214 dst.y = (src0.y >= src1.y) ? 1.0 : 0.0
216 dst.z = (src0.z >= src1.z) ? 1.0 : 0.0
218 dst.w = (src0.w >= src1.w) ? 1.0 : 0.0
221 MAD - Multiply And Add
225 dst.x = src0.x * src1.x + src2.x
227 dst.y = src0.y * src1.y + src2.y
229 dst.z = src0.z * src1.z + src2.z
231 dst.w = src0.w * src1.w + src2.w
238 dst.x = src0.x - src1.x
240 dst.y = src0.y - src1.y
242 dst.z = src0.z - src1.z
244 dst.w = src0.w - src1.w
247 LRP - Linear Interpolate
251 dst.x = src0.x * (src1.x - src2.x) + src2.x
253 dst.y = src0.y * (src1.y - src2.y) + src2.y
255 dst.z = src0.z * (src1.z - src2.z) + src2.z
257 dst.w = src0.w * (src1.w - src2.w) + src2.w
264 dst.x = (src2.x > 0.5) ? src0.x : src1.x
266 dst.y = (src2.y > 0.5) ? src0.y : src1.y
268 dst.z = (src2.z > 0.5) ? src0.z : src1.z
270 dst.w = (src2.w > 0.5) ? src0.w : src1.w
273 DP2A - 2-component Dot Product And Add
277 dst.x = src0.x * src1.x + src0.y * src1.y + src2.x
279 dst.y = src0.x * src1.x + src0.y * src1.y + src2.x
281 dst.z = src0.x * src1.x + src0.y * src1.y + src2.x
283 dst.w = src0.x * src1.x + src0.y * src1.y + src2.x
290 dst.x = src.x - floor(src.x)
292 dst.y = src.y - floor(src.y)
294 dst.z = src.z - floor(src.z)
296 dst.w = src.w - floor(src.w)
303 dst.x = clamp(src0.x, src1.x, src2.x)
304 dst.y = clamp(src0.y, src1.y, src2.y)
305 dst.z = clamp(src0.z, src1.z, src2.z)
306 dst.w = clamp(src0.w, src1.w, src2.w)
329 1.3.10 EX2 - Exponential Base 2
333 dst.x = pow(2.0, src.x)
334 dst.y = pow(2.0, src.x)
335 dst.z = pow(2.0, src.x)
336 dst.w = pow(2.0, src.x)
339 1.3.11 LG2 - Logarithm Base 2
353 dst.x = pow(src0.x, src1.x)
354 dst.y = pow(src0.x, src1.x)
355 dst.z = pow(src0.x, src1.x)
356 dst.w = pow(src0.x, src1.x)
358 1.3.15 XPD - Cross Product
362 dst.x = src0.y * src1.z - src1.y * src0.z
363 dst.y = src0.z * src1.x - src1.z * src0.x
364 dst.z = src0.x * src1.y - src1.x * src0.y
378 1.4.2 RCC - Reciprocal Clamped
382 dst.x = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
383 dst.y = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
384 dst.z = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
385 dst.w = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
388 1.4.3 DPH - Homogeneous Dot Product
392 dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
393 dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
394 dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
395 dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
408 1.5.2 DDX - Derivative Relative To X
412 dst.x = partialx(src.x)
413 dst.y = partialx(src.y)
414 dst.z = partialx(src.z)
415 dst.w = partialx(src.w)
418 1.5.3 DDY - Derivative Relative To Y
422 dst.x = partialy(src.x)
423 dst.y = partialy(src.y)
424 dst.z = partialy(src.z)
425 dst.w = partialy(src.w)
428 1.5.7 KILP - Predicated Discard
435 1.5.10 PK2H - Pack Two 16-bit Floats
440 1.5.11 PK2US - Pack Two Unsigned 16-bit Scalars
445 1.5.12 PK4B - Pack Four Signed 8-bit Scalars
450 1.5.13 PK4UB - Pack Four Unsigned 8-bit Scalars
455 1.5.15 RFL - Reflection Vector
459 dst.x = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.x - src1.x
460 dst.y = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.y - src1.y
461 dst.z = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.z - src1.z
464 Considered for removal.
467 1.5.16 SEQ - Set On Equal
471 dst.x = (src0.x == src1.x) ? 1.0 : 0.0
472 dst.y = (src0.y == src1.y) ? 1.0 : 0.0
473 dst.z = (src0.z == src1.z) ? 1.0 : 0.0
474 dst.w = (src0.w == src1.w) ? 1.0 : 0.0
477 1.5.17 SFL - Set On False
486 Considered for removal.
488 1.5.18 SGT - Set On Greater Than
492 dst.x = (src0.x > src1.x) ? 1.0 : 0.0
493 dst.y = (src0.y > src1.y) ? 1.0 : 0.0
494 dst.z = (src0.z > src1.z) ? 1.0 : 0.0
495 dst.w = (src0.w > src1.w) ? 1.0 : 0.0
508 1.5.20 SLE - Set On Less Than Or Equal
512 dst.x = (src0.x <= src1.x) ? 1.0 : 0.0
513 dst.y = (src0.y <= src1.y) ? 1.0 : 0.0
514 dst.z = (src0.z <= src1.z) ? 1.0 : 0.0
515 dst.w = (src0.w <= src1.w) ? 1.0 : 0.0
518 1.5.21 SNE - Set On Not Equal
522 dst.x = (src0.x != src1.x) ? 1.0 : 0.0
523 dst.y = (src0.y != src1.y) ? 1.0 : 0.0
524 dst.z = (src0.z != src1.z) ? 1.0 : 0.0
525 dst.w = (src0.w != src1.w) ? 1.0 : 0.0
528 1.5.22 STR - Set On True
538 1.5.23 TEX - Texture Lookup
543 1.5.24 TXD - Texture Lookup with Derivatives
548 1.5.25 TXP - Projective Texture Lookup
553 1.5.26 UP2H - Unpack Two 16-Bit Floats
557 Considered for removal.
559 1.5.27 UP2US - Unpack Two Unsigned 16-Bit Scalars
563 Considered for removal.
565 1.5.28 UP4B - Unpack Four Signed 8-Bit Values
569 Considered for removal.
571 1.5.29 UP4UB - Unpack Four Unsigned 8-Bit Scalars
575 Considered for removal.
577 1.5.30 X2D - 2D Coordinate Transformation
581 dst.x = src0.x + src1.x * src2.x + src1.y * src2.y
582 dst.y = src0.y + src1.x * src2.z + src1.y * src2.w
583 dst.z = src0.x + src1.x * src2.x + src1.y * src2.y
584 dst.w = src0.y + src1.x * src2.z + src1.y * src2.w
586 Considered for removal.
589 1.6 GL_NV_vertex_program2
590 --------------------------
593 1.6.1 ARA - Address Register Add
597 Considered for removal.
599 1.6.2 ARR - Address Register Load With Round
613 Considered for removal.
615 1.6.4 CAL - Subroutine Call
621 1.6.5 RET - Subroutine Call Return
625 Potential restrictions:
626 * Only occurs at end of function.
632 dst.x = (src.x > 0.0) ? 1.0 : (src.x < 0.0) ? -1.0 : 0.0
633 dst.y = (src.y > 0.0) ? 1.0 : (src.y < 0.0) ? -1.0 : 0.0
634 dst.z = (src.z > 0.0) ? 1.0 : (src.z < 0.0) ? -1.0 : 0.0
635 dst.w = (src.w > 0.0) ? 1.0 : (src.w < 0.0) ? -1.0 : 0.0
642 dst.x = (src0.x < 0.0) ? src1.x : src2.x
643 dst.y = (src0.y < 0.0) ? src1.y : src2.y
644 dst.z = (src0.z < 0.0) ? src1.z : src2.z
645 dst.w = (src0.w < 0.0) ? src1.w : src2.w
648 1.8.2 KIL - Conditional Discard
652 if (src.x < 0.0 || src.y < 0.0 || src.z < 0.0 || src.w < 0.0)
657 1.8.3 SCS - Sine Cosine
667 1.8.4 TXB - Texture Lookup With Bias
672 1.9.1 NRM - 3-component Vector Normalise
676 dst.x = src.x / sqrt(src.x * src.x + src.y * src.y + src.z * src.z)
677 dst.y = src.y / sqrt(src.x * src.x + src.y * src.y + src.z * src.z)
678 dst.z = src.z / sqrt(src.x * src.x + src.y * src.y + src.z * src.z)
686 dst.x = src0.x / src1.x
687 dst.y = src0.y / src1.y
688 dst.z = src0.z / src1.z
689 dst.w = src0.w / src1.w
692 1.9.3 DP2 - 2-component Dot Product
696 dst.x = src0.x * src1.x + src0.y * src1.y
697 dst.y = src0.x * src1.x + src0.y * src1.y
698 dst.z = src0.x * src1.x + src0.y * src1.y
699 dst.w = src0.x * src1.x + src0.y * src1.y
702 1.9.5 TXL - Texture Lookup With LOD
717 1.9.8 BGNFOR - Begin a For-Loop
724 pc = [matching ENDFOR] + 1
727 Note: The destination must be a loop register.
728 The source must be a constant register.
730 Considered for cleanup / removal.
743 1.9.11 ENDIF - End If
748 1.9.12 ENDFOR - End a For-Loop
750 dst.x = dst.x + dst.z
754 pc = [matching BGNFOR instruction] + 1
757 Note: The destination must be a loop register.
759 Considered for cleanup / removal.
761 1.9.13 ENDREP - End Repeat
766 1.10.1 PUSHA - Push Address Register On Stack
773 Considered for cleanup / removal.
775 1.10.2 POPA - Pop Address Register From Stack
782 Considered for cleanup / removal.
785 1.11 GL_NV_gpu_program4
786 ------------------------
788 Support for these opcodes is indicated by a special pipe capability bit (TBD).
790 1.11.1 CEIL - Ceiling
800 1.11.2 I2F - Integer To Float
804 dst.x = (float) src.x
805 dst.y = (float) src.y
806 dst.z = (float) src.z
807 dst.w = (float) src.w
810 1.11.3 NOT - Bitwise Not
820 1.11.4 TRUNC - Truncate
830 1.11.5 SHL - Shift Left
834 dst.x = src0.x << src1.x
835 dst.y = src0.y << src1.x
836 dst.z = src0.z << src1.x
837 dst.w = src0.w << src1.x
840 1.11.6 SHR - Shift Right
844 dst.x = src0.x >> src1.x
845 dst.y = src0.y >> src1.x
846 dst.z = src0.z >> src1.x
847 dst.w = src0.w >> src1.x
850 1.11.7 AND - Bitwise And
854 dst.x = src0.x & src1.x
855 dst.y = src0.y & src1.y
856 dst.z = src0.z & src1.z
857 dst.w = src0.w & src1.w
860 1.11.8 OR - Bitwise Or
864 dst.x = src0.x | src1.x
865 dst.y = src0.y | src1.y
866 dst.z = src0.z | src1.z
867 dst.w = src0.w | src1.w
874 dst.x = src0.x % src1.x
875 dst.y = src0.y % src1.y
876 dst.z = src0.z % src1.z
877 dst.w = src0.w % src1.w
880 1.11.10 XOR - Bitwise Xor
884 dst.x = src0.x ^ src1.x
885 dst.y = src0.y ^ src1.y
886 dst.z = src0.z ^ src1.z
887 dst.w = src0.w ^ src1.w
890 1.11.11 SAD - Sum Of Absolute Differences
894 dst.x = abs(src0.x - src1.x) + src2.x
895 dst.y = abs(src0.y - src1.y) + src2.y
896 dst.z = abs(src0.z - src1.z) + src2.z
897 dst.w = abs(src0.w - src1.w) + src2.w
900 1.11.12 TXF - Texel Fetch
905 1.11.13 TXQ - Texture Size Query
910 1.11.14 CONT - Continue
915 1.12 GL_NV_geometry_program4
916 -----------------------------
924 1.12.2 ENDPRIM - End Primitive
933 1.13.1 BGNLOOP - Begin a Loop
938 1.13.2 BGNSUB - Begin Subroutine
943 1.13.3 ENDLOOP - End a Loop
948 1.13.4 ENDSUB - End Subroutine
954 1.13.10 NOP - No Operation
960 1.16.7 NRM4 - 4-component Vector Normalise
964 dst.x = src.x / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
965 dst.y = src.y / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
966 dst.z = src.z / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
967 dst.w = src.w / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
974 1.17.2 CALLNZ - Subroutine Call If Not Zero
984 1.17.5 BREAKC - Break Conditional
989 2 Explanation of symbols used
990 ==============================
997 abs(x) Absolute value of x.
1000 ceil(x) Ceiling of x.
1002 clamp(x,y,z) Clamp x between y and z.
1003 (x < y) ? y : (x > z) ? z : x
1007 floor(x) Floor of x.
1009 lg2(x) Logarithm base 2 of x.
1011 max(x,y) Maximum of x and y.
1014 min(x,y) Minimum of x and y.
1017 partialx(x) Derivative of x relative to fragment's X.
1019 partialy(x) Derivative of x relative to fragment's Y.
1021 pop() Pop from stack.
1023 pow(x,y) Raise x to power of y.
1025 push(x) Push x on stack.
1031 sqrt(x) Square root of x.
1033 trunc(x) Truncate x.
1040 discard Discard fragment.
1042 dst First destination register.
1044 dst0 First destination register.
1048 src First source register.
1050 src0 First source register.
1052 src1 Second source register.
1054 src2 Third source register.
1056 target Label of target instruction.
1063 3.1 Declaration Semantic
1064 -------------------------
1067 Follows Declaration token if Semantic bit is set.
1069 Since its purpose is to link a shader with other stages of the pipeline,
1070 it may only follow Declaration tokens that declare a register
1071 in either the INPUT or the OUTPUT file.
1073 SemanticName field contains the semantic name of the register being declared.
1074 There is no default value.
1076 SemanticIndex is an optional subscript that can be used to distinguish
1077 different register declarations with the same semantic name. The default value
1080 The meanings of the individual semantic names are explained in the following
1086 Valid only in a fragment shader INPUT declaration.
1088 FACE.x is negative when the primitive is back facing. FACE.x is positive
1089 when the primitive is front facing.