#ifndef BRW_EU_DEFINES_H
#define BRW_EU_DEFINES_H
+#include <stdint.h>
+#include <stdlib.h>
#include "util/macros.h"
/* The following hunk, up-to "Execution Unit" is used by both the
enum opcode {
/* These are the actual hardware instructions. */
BRW_OPCODE_ILLEGAL,
+ BRW_OPCODE_SYNC,
BRW_OPCODE_MOV,
BRW_OPCODE_SEL,
BRW_OPCODE_MOVI, /**< G45+ */
*/
SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
+ SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
+ SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
/**
* Memory fence messages.
*
* Source 0: Must be register g0, used as header.
- * Source 1: Immediate bool to indicate whether or not we need to stall
- * until memory transactions prior to the fence are completed.
+ * Source 1: Immediate bool to indicate whether control is returned to the
+ * thread only after the fence has been honored.
* Source 2: Immediate byte indicating which memory to fence. Zero means
* global memory; GEN7_BTI_SLM means SLM (for Gen11+ only).
*
*/
SHADER_OPCODE_MEMORY_FENCE,
+ /**
+ * Scheduling-only fence.
+ *
+ * Sources can be used to force a stall until the registers in those are
+ * available. This might generate MOVs or SYNC_NOPs (Gen12+).
+ */
+ FS_OPCODE_SCHEDULING_FENCE,
+
SHADER_OPCODE_GEN4_SCRATCH_READ,
SHADER_OPCODE_GEN4_SCRATCH_WRITE,
SHADER_OPCODE_GEN7_SCRATCH_READ,
*/
SHADER_OPCODE_FIND_LIVE_CHANNEL,
+ /**
+ * Return the current execution mask in the specified flag subregister.
+ * Can be CSE'ed more easily than a plain MOV from the ce0 ARF register.
+ */
+ FS_OPCODE_LOAD_LIVE_CHANNELS,
+
/**
* Pick the channel from its first source register given by the index
* specified as second source. Useful for variable indexing of surfaces.
*/
SHADER_OPCODE_MULH,
+ /** Signed subtraction with saturation. */
+ SHADER_OPCODE_ISUB_SAT,
+
+ /** Unsigned subtraction with saturation. */
+ SHADER_OPCODE_USUB_SAT,
+
/**
* A MOV that uses VxH indirect addressing.
*
BRW_WIDTH_16 = 4,
};
+/**
+ * Gen12+ SWSB SBID synchronization mode.
+ *
+ * This is represented as a bitmask including any required SBID token
+ * synchronization modes, used to synchronize out-of-order instructions. Only
+ * the strongest mode of the mask will be provided to the hardware in the SWSB
+ * field of an actual hardware instruction, but virtual instructions may be
+ * able to take into account multiple of them.
+ *
+ * Each non-null enumerant is a distinct bit so that several of them can be
+ * OR-ed into one mask. When a mask holds more than one bit,
+ * tgl_swsb_encode() tests SET first, then DST, and falls back to SRC.
+ */
+enum tgl_sbid_mode {
+ TGL_SBID_NULL = 0, /* empty mask: no SBID synchronization requested */
+ TGL_SBID_SRC = 1, /* NOTE(review): presumably waits on the source reads of the token's owner -- confirm against the PRM */
+ TGL_SBID_DST = 2, /* NOTE(review): presumably waits on the destination write of the token's owner -- confirm against the PRM */
+ TGL_SBID_SET = 4 /* allocates an SBID token; the only bit kept by tgl_swsb_dst_dep() */
+};
+
+#ifdef __cplusplus
+/**
+ * Allow bitwise arithmetic of tgl_sbid_mode enums.
+ *
+ * C++ does not implicitly convert the integer result of a bitwise operation
+ * back to an enumeration type, so each overload performs the arithmetic on
+ * unsigned values and casts the result back to tgl_sbid_mode. These
+ * overloads are only compiled when the header is included from C++.
+ */
+inline tgl_sbid_mode
+operator|(tgl_sbid_mode x, tgl_sbid_mode y) /* union of two mode masks */
+{
+ return tgl_sbid_mode(unsigned(x) | unsigned(y));
+}
+
+inline tgl_sbid_mode
+operator&(tgl_sbid_mode x, tgl_sbid_mode y) /* intersection of two mode masks */
+{
+ return tgl_sbid_mode(unsigned(x) & unsigned(y));
+}
+
+inline tgl_sbid_mode &
+operator|=(tgl_sbid_mode &x, tgl_sbid_mode y) /* in-place union, returns x */
+{
+ return x = x | y;
+}
+
+#endif /* __cplusplus */
+
+/**
+ * Logical representation of the SWSB scheduling information of a hardware
+ * instruction. The binary representation is slightly more compact.
+ *
+ * The three fields use 3 + 4 + 3 bits, whereas tgl_swsb_encode() packs the
+ * same information into a single byte.
+ */
+struct tgl_swsb {
+ unsigned regdist : 3; /* one-based distance to an earlier in-order instruction; 0 means no RegDist dependency */
+ unsigned sbid : 4; /* SBID token index, 0..15 */
+ enum tgl_sbid_mode mode : 3; /* bitmask of TGL_SBID_* flags; 0 (TGL_SBID_NULL) means sbid is unused */
+};
+
+/**
+ * Construct a scheduling annotation with a single RegDist dependency. This
+ * synchronizes with the completion of the d-th previous in-order instruction.
+ * The index is one-based, zero causes a no-op tgl_swsb to be constructed.
+ *
+ * \param d  RegDist value. It must fit in the 3-bit regdist field (0..7),
+ *           which is checked by an assertion.
+ * \return   A tgl_swsb with regdist set to \p d and with sbid and mode zero.
+ */
+static inline struct tgl_swsb
+tgl_swsb_regdist(unsigned d)
+{
+ const struct tgl_swsb swsb = { d }; /* remaining members (sbid, mode) are zero-initialized */
+ assert(swsb.regdist == d); /* fails if d was truncated by the 3-bit field */
+ return swsb;
+}
+
+/**
+ * Construct a scheduling annotation that synchronizes with the specified SBID
+ * token.
+ *
+ * \param mode  Mask of TGL_SBID_* synchronization modes to apply to the token.
+ *              It is stored as given and is not range-checked here.
+ * \param sbid  SBID token index. It must fit in the 4-bit sbid field (0..15),
+ *              which is checked by an assertion.
+ * \return      A tgl_swsb with the given sbid and mode and no RegDist
+ *              dependency (regdist is zero).
+ */
+static inline struct tgl_swsb
+tgl_swsb_sbid(enum tgl_sbid_mode mode, unsigned sbid)
+{
+ const struct tgl_swsb swsb = { 0, sbid, mode }; /* member order: regdist, sbid, mode */
+ assert(swsb.sbid == sbid); /* fails if sbid was truncated by the 4-bit field */
+ return swsb;
+}
+
+/**
+ * Construct a no-op scheduling annotation.
+ *
+ * \return  A tgl_swsb whose regdist, sbid and mode are all zero. It is built
+ *          through tgl_swsb_regdist(0).
+ */
+static inline struct tgl_swsb
+tgl_swsb_null(void)
+{
+ return tgl_swsb_regdist(0); /* a zero RegDist yields the all-zero annotation */
+}
+
+/**
+ * Return a scheduling annotation that allocates the same SBID synchronization
+ * token as \p swsb. In addition it will synchronize against a previous
+ * in-order instruction if \p regdist is non-zero.
+ *
+ * \param swsb     Annotation to derive from. It is passed by value, so the
+ *                 caller's copy is not modified.
+ * \param regdist  Replacement RegDist value. It is not range-checked here, so
+ *                 a value above 7 is silently truncated by the 3-bit field.
+ * \return         A copy of \p swsb with regdist replaced, sbid unchanged, and
+ *                 every mode bit other than TGL_SBID_SET cleared.
+ */
+static inline struct tgl_swsb
+tgl_swsb_dst_dep(struct tgl_swsb swsb, unsigned regdist)
+{
+ swsb.regdist = regdist; /* overrides whatever RegDist swsb carried */
+ swsb.mode = swsb.mode & TGL_SBID_SET; /* keep only the token allocation, drop SRC/DST waits */
+ return swsb;
+}
+
+/**
+ * Return a scheduling annotation that synchronizes against the same SBID and
+ * RegDist dependencies as \p swsb, but doesn't allocate any SBID token.
+ *
+ * \param swsb  Annotation to derive from. It is passed by value, so the
+ *              caller's copy is not modified.
+ * \return      A copy of \p swsb with regdist and sbid unchanged and with the
+ *              TGL_SBID_SET bit cleared from mode.
+ */
+static inline struct tgl_swsb
+tgl_swsb_src_dep(struct tgl_swsb swsb)
+{
+ swsb.mode = swsb.mode & (TGL_SBID_SRC | TGL_SBID_DST); /* keep the waits, drop the allocation */
+ return swsb;
+}
+
+/**
+ * Convert the provided tgl_swsb to the hardware's binary representation of an
+ * SWSB annotation.
+ *
+ * Three layouts are produced, selected by which fields are non-zero:
+ *
+ *  - mode == 0:              the byte is just regdist (0..7).
+ *  - mode != 0, regdist != 0: bit 7 set, regdist in bits 6:4, sbid in bits
+ *                            3:0. The mode itself is not stored in this
+ *                            form; tgl_swsb_decode() infers it from the
+ *                            opcode.
+ *  - mode != 0, regdist == 0: sbid in bits 3:0 plus a mode selector of 0x40
+ *                            when TGL_SBID_SET is in the mask, else 0x20
+ *                            when TGL_SBID_DST is in the mask, else 0x30.
+ *
+ * \param swsb  Annotation to encode.
+ * \return      The encoded byte.
+ */
+static inline uint8_t
+tgl_swsb_encode(struct tgl_swsb swsb)
+{
+ if (!swsb.mode) {
+ return swsb.regdist; /* RegDist-only (or null) annotation */
+ } else if (swsb.regdist) {
+ return 0x80 | swsb.regdist << 4 | swsb.sbid; /* << binds tighter than |, so this is 0x80 | (regdist << 4) | sbid */
+ } else {
+ return swsb.sbid | (swsb.mode & TGL_SBID_SET ? 0x40 :
+ swsb.mode & TGL_SBID_DST ? 0x20 : 0x30); /* only one mode of the mask is emitted: SET, then DST, then the fallback (SRC) */
+ }
+}
+
+/**
+ * Convert the provided binary representation of an SWSB annotation to a
+ * tgl_swsb.
+ *
+ * This is the inverse of tgl_swsb_encode(). When bit 7 is set the byte holds
+ * both a RegDist (bits 6:4) and an SBID (bits 3:0) but no mode, so the mode
+ * is derived from \p opcode: TGL_SBID_SET for SEND, SENDC and MATH, and
+ * TGL_SBID_DST for every other opcode. Otherwise bits 6:4 select the form:
+ * 0x20 is a DST wait, 0x30 a SRC wait and 0x40 a SET, each with the SBID in
+ * bits 3:0. Any other value is treated as a plain RegDist taken from bits
+ * 2:0.
+ *
+ * \param opcode  Opcode of the instruction carrying the annotation. It is
+ *                only consulted when bit 7 of \p x is set.
+ * \param x       Encoded SWSB byte.
+ * \return        The decoded annotation.
+ */
+static inline struct tgl_swsb
+tgl_swsb_decode(enum opcode opcode, uint8_t x)
+{
+ if (x & 0x80) {
+ const struct tgl_swsb swsb = { (x & 0x70u) >> 4, x & 0xfu,
+ (opcode == BRW_OPCODE_SEND ||
+ opcode == BRW_OPCODE_SENDC ||
+ opcode == BRW_OPCODE_MATH) ?
+ TGL_SBID_SET : TGL_SBID_DST }; /* member order: regdist, sbid, mode */
+ return swsb;
+ } else if ((x & 0x70) == 0x20) {
+ return tgl_swsb_sbid(TGL_SBID_DST, x & 0xfu);
+ } else if ((x & 0x70) == 0x30) {
+ return tgl_swsb_sbid(TGL_SBID_SRC, x & 0xfu);
+ } else if ((x & 0x70) == 0x40) {
+ return tgl_swsb_sbid(TGL_SBID_SET, x & 0xfu);
+ } else {
+ return tgl_swsb_regdist(x & 0x7u); /* bits 6:3 are ignored in this form */
+ }
+}
+
+enum tgl_sync_function { /* NOTE(review): presumably the function selector of the Gen12+ SYNC instruction (BRW_OPCODE_SYNC) -- confirm against the PRM */
+ TGL_SYNC_NOP = 0x0,
+ TGL_SYNC_ALLRD = 0x2,
+ TGL_SYNC_ALLWR = 0x3,
+ TGL_SYNC_BAR = 0xe,
+ TGL_SYNC_HOST = 0xf /* values are not contiguous: 0x1 and 0x4..0xd are not defined here */
+};
+
/**
* Message target: Shared Function ID for where to SEND a message.
*
/* Dataport special binding table indices: */
#define BRW_BTI_STATELESS 255
#define GEN7_BTI_SLM 254
-/* Note that on Gen8+ BTI 255 was redefined to be IA-coherent according to the
- * hardware spec, however because the DRM sets bit 4 of HDC_CHICKEN0 on BDW,
- * CHV and at least some pre-production steppings of SKL due to
- * WaForceEnableNonCoherent, HDC memory access may have been overridden by the
- * kernel to be non-coherent (matching the behavior of the same BTI on
- * pre-Gen8 hardware) and BTI 255 may actually be an alias for BTI 253.
+
+#define HSW_BTI_STATELESS_LOCALLY_COHERENT 255 /* same index as BRW_BTI_STATELESS and GEN8_BTI_STATELESS_IA_COHERENT */
+#define HSW_BTI_STATELESS_NON_COHERENT 253 /* same index as GEN8_BTI_STATELESS_NON_COHERENT */
+#define HSW_BTI_STATELESS_GLOBALLY_COHERENT 252
+#define HSW_BTI_STATELESS_LLC_COHERENT 251
+#define HSW_BTI_STATELESS_L3_UNCACHED 250
+
+/* The hardware docs are a bit contradictory here. On Haswell, where they
+ * first added cacheability control, there were 5 different cache modes (see
+ * HSW_BTI_STATELESS_* above). On Broadwell, they reduced to two:
+ *
+ * - IA-Coherent (BTI=255): Coherent within Gen and coherent within the
+ * entire IA cache memory hierarchy.
+ *
+ * - Non-Coherent (BTI=253): Coherent within Gen, same cache type.
+ *
+ * Information about stateless cache coherency can be found in the "A32
+ * Stateless" section of the "3D Media GPGPU" volume of the PRM for each
+ * hardware generation.
+ *
+ * Unfortunately, the docs for MDC_STATELESS appear to have been copied and
+ * pasted from Haswell and give the Haswell definitions for the BTI values of
+ * 255 and 253 including a warning about accessing 253 surfaces from multiple
+ * threads. This seems to be a copy+paste error and the definitions from the
+ * "A32 Stateless" section should be trusted instead.
+ *
+ * Note that because the DRM sets bit 4 of HDC_CHICKEN0 on BDW, CHV and at
+ * least some pre-production steppings of SKL due to WaForceEnableNonCoherent,
+ * HDC memory access may have been overridden by the kernel to be non-coherent
+ * (matching the behavior of the same BTI on pre-Gen8 hardware) and BTI 255
+ * may actually be an alias for BTI 253.
*/
#define GEN8_BTI_STATELESS_IA_COHERENT 255
#define GEN8_BTI_STATELESS_NON_COHERENT 253