gpu-compute: parametrize Wavefront size

author jkalamat <john.kalamatianos@amd.com>

Thu, 9 Jun 2016 15:24:55 +0000 (11:24 -0400)

committer jkalamat <john.kalamatianos@amd.com>

Thu, 9 Jun 2016 15:24:55 +0000 (11:24 -0400)
author jkalamat <john.kalamatianos@amd.com>
Thu, 9 Jun 2016 15:24:55 +0000 (11:24 -0400)
committer jkalamat <john.kalamatianos@amd.com>
Thu, 9 Jun 2016 15:24:55 +0000 (11:24 -0400)
diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py

index 75819b505ebc746af3f6f94dcd4532ddf7485190..27a26071bea6015ad83904f0fe70ae1021687868 100644 (file)
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -250,7 +250,8 @@ for i in xrange(n_cu):
      vrfs = []
      for j in xrange(options.simds_per_cu):
          for k in xrange(shader.n_wf):
-            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k))
+            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
+                                        wfSize = options.wf_size))
          vrfs.append(VectorRegisterFile(simd_id=j,
                                num_regs_per_simd=options.vreg_file_size))
      compute_units[-1].wavefronts = wavefronts
diff --git a/src/arch/hsail/gen.py b/src/arch/hsail/gen.py

index bb369fd1090cbb0d29120bcebd83a18096118abf..f7768054134d4748002ff65e35bfe644ab88c9ed 100755 (executable)
--- a/src/arch/hsail/gen.py
+++ b/src/arch/hsail/gen.py
@@ -235,7 +235,7 @@ $class_name::execute(GPUDynInstPtr gpuDynInst)
  
      const VectorMask &mask = w->get_pred();
  
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
          if (mask[lane]) {
              DestCType dest_val = $expr;
              this->dest.set(w, lane, dest_val);
@@ -256,7 +256,7 @@ $class_name::execute(GPUDynInstPtr gpuDynInst)
  
      const VectorMask &mask = w->get_pred();
  
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
          if (mask[lane]) {
              SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
              DestCType dest_val = $expr;
@@ -277,7 +277,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
  
      const VectorMask &mask = w->get_pred();
  
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
          if (mask[lane]) {
              CType dest_val;
              if ($dest_is_src_flag) {
@@ -312,7 +312,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
  
      const VectorMask &mask = w->get_pred();
  
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
          if (mask[lane]) {
              CType dest_val;
  
@@ -346,7 +346,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
  
      const VectorMask &mask = w->get_pred();
  
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
          if (mask[lane]) {
              DestT dest_val;
              if ($dest_is_src_flag) {
@@ -372,7 +372,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
      Wavefront *w = gpuDynInst->wavefront();
  
      const VectorMask &mask = w->get_pred();
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
          if (mask[lane]) {
              CType dest_val;
  
@@ -401,7 +401,7 @@ $class_name<DestDataType, SrcDataType>::execute(GPUDynInstPtr gpuDynInst)
  
      const VectorMask &mask = w->get_pred();
  
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
          if (mask[lane]) {
              DestCType dest_val;
              SrcCType src_val[$num_srcs];
diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh

index f4b00fc8d2b1f4292e0bd06c0e824d978b6d2ba6..45cd876ad147069cec8bb045e511a1be4c038027 100644 (file)
--- a/src/arch/hsail/insts/branch.hh
+++ b/src/arch/hsail/insts/branch.hh
@@ -279,7 +279,7 @@ namespace HsailISA
          // taken branch
          const uint32_t true_pc = getTargetPc();
          VectorMask true_mask;
-        for (unsigned int lane = 0; lane < VSZ; ++lane) {
+        for (unsigned int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
          }
  
diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc

index 4e70bf46a95665e9014501c202100c523575f683..004054524cf9a7afdcfc6fe0685766ffe7175c22 100644 (file)
--- a/src/arch/hsail/insts/main.cc
+++ b/src/arch/hsail/insts/main.cc
@@ -134,7 +134,7 @@ namespace HsailISA
          const VectorMask &mask = w->get_pred();
  
          // mask off completed work-items
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  w->init_mask[lane] = 0;
              }
diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh

index f2792cd49c13eeee7c16f5753f69ba6e61f9ce0e..1db98d212d7f09b97b01330b8c47ae9809493dc9 100644 (file)
--- a/src/arch/hsail/insts/mem.hh
+++ b/src/arch/hsail/insts/mem.hh
@@ -457,7 +457,7 @@ namespace HsailISA
              gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
  
              if (num_dest_operands > 1) {
-                for (int i = 0; i < VSZ; ++i)
+                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                      if (gpuDynInst->exec_mask[i])
                          gpuDynInst->statusVector.push_back(num_dest_operands);
                      else
@@ -466,9 +466,10 @@ namespace HsailISA
  
              for (int k = 0; k < num_dest_operands; ++k) {
  
-                c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+                c0 *d = &((c0*)gpuDynInst->d_data)
+                    [k * gpuDynInst->computeUnit()->wfSize()];
  
-                for (int i = 0; i < VSZ; ++i) {
+                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                      if (gpuDynInst->exec_mask[i]) {
                          Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
  
@@ -1004,7 +1005,7 @@ namespace HsailISA
              gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
  
              if (num_src_operands > 1) {
-                for (int i = 0; i < VSZ; ++i)
+                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                      if (gpuDynInst->exec_mask[i])
                          gpuDynInst->statusVector.push_back(num_src_operands);
                      else
@@ -1012,9 +1013,10 @@ namespace HsailISA
              }
  
              for (int k = 0; k < num_src_operands; ++k) {
-                c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+                c0 *d = &((c0*)gpuDynInst->d_data)
+                    [k * gpuDynInst->computeUnit()->wfSize()];
  
-                for (int i = 0; i < VSZ; ++i) {
+                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                      if (gpuDynInst->exec_mask[i]) {
                          Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
  
@@ -1402,7 +1404,7 @@ namespace HsailISA
              c0 *e = &((c0*) gpuDynInst->a_data)[0];
              c0 *f = &((c0*) gpuDynInst->x_data)[0];
  
-            for (int i = 0; i < VSZ; ++i) {
+            for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                  if (gpuDynInst->exec_mask[i]) {
                      Addr vaddr = gpuDynInst->addr[i];
  
diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh

index 94f0cd6aac0d1241832f1a140369ba6690fb7ca5..8329c6e8a22f070164c9063b365c2327d8d71220 100644 (file)
--- a/src/arch/hsail/insts/mem_impl.hh
+++ b/src/arch/hsail/insts/mem_impl.hh
@@ -60,14 +60,16 @@ namespace HsailISA
  
          typedef typename DestDataType::CType CType M5_VAR_USED;
          const VectorMask &mask = w->get_pred();
-        uint64_t addr_vec[VSZ];
+        std::vector<Addr> addr_vec;
+        addr_vec.resize(w->computeUnit->wfSize(), (Addr)0);
          this->addr.calcVector(w, addr_vec);
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  this->dest.set(w, lane, addr_vec[lane]);
              }
          }
+        addr_vec.clear();
      }
  
      template<typename MemDataType, typename DestDataType,
@@ -121,8 +123,8 @@ namespace HsailISA
              i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
          assert(se);
  
-        return w->wfSlotId * w->privSizePerItem * VSZ +
-            se->offset * VSZ +
+        return w->wfSlotId * w->privSizePerItem * w->computeUnit->wfSize() +
+            se->offset * w->computeUnit->wfSize() +
              lane * se->size;
          */
  
@@ -139,9 +141,11 @@ namespace HsailISA
          Addr addr_div8 = addr / 8;
          Addr addr_mod8 = addr % 8;
  
-        Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;
+        Addr ret = addr_div8 * 8 * w->computeUnit->wfSize() + lane * 8 +
+            addr_mod8 + w->privBase;
  
-        assert(ret < w->privBase + (w->privSizePerItem * VSZ));
+        assert(ret < w->privBase +
+               (w->privSizePerItem * w->computeUnit->wfSize()));
  
          return ret;
      }
@@ -175,7 +179,7 @@ namespace HsailISA
  
              DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
  
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                  if (mask[lane]) {
                      this->dest.set(w, lane, val);
                  }
@@ -184,7 +188,7 @@ namespace HsailISA
              return;
          } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
              uint64_t address = this->addr.calcUniform();
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                  if (mask[lane]) {
                      MemCType val = w->readCallArgMem<MemCType>(lane, address);
  
@@ -239,7 +243,7 @@ namespace HsailISA
              // this is a complete hack to get around a compiler bug
              // (the compiler currently generates global access for private
              //  addresses (starting from 0). We need to add the private offset)
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                  if (m->addr[lane] < w->privSizePerItem) {
                      if (mask[lane]) {
                          // what is the size of the object we are accessing?
@@ -267,7 +271,7 @@ namespace HsailISA
              m->pipeId = GLBMEM_PIPE;
              m->latency.set(w->computeUnit->shader->ticks(1));
              {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                      //  note: this calculation will NOT WORK if the compiler
                      //  ever generates loads/stores to the same address with
                      //  different widths (e.g., a ld_u32 addr and a ld_u16 addr)
@@ -301,7 +305,7 @@ namespace HsailISA
              m->pipeId = GLBMEM_PIPE;
              m->latency.set(w->computeUnit->shader->ticks(1));
  
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                  if (mask[lane]) {
                      assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
                      m->addr[lane] += w->roBase;
@@ -318,7 +322,7 @@ namespace HsailISA
              m->pipeId = GLBMEM_PIPE;
              m->latency.set(w->computeUnit->shader->ticks(1));
              {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                      if (mask[lane]) {
                          assert(m->addr[lane] < w->privSizePerItem);
  
@@ -360,7 +364,7 @@ namespace HsailISA
          if (this->segment == Brig::BRIG_SEGMENT_ARG) {
              uint64_t address = this->addr.calcUniform();
  
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                  if (mask[lane]) {
                      CType data = this->src.template get<CType>(w, lane);
                      DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
@@ -378,7 +382,7 @@ namespace HsailISA
          this->addr.calcVector(w, m->addr);
  
          if (num_src_operands == 1) {
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                  if (mask[lane]) {
                      ((CType*)m->d_data)[lane] =
                          this->src.template get<CType>(w, lane);
@@ -386,9 +390,9 @@ namespace HsailISA
              }
          } else {
              for (int k= 0; k < num_src_operands; ++k) {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                      if (mask[lane]) {
-                        ((CType*)m->d_data)[k * VSZ + lane] =
+                        ((CType*)m->d_data)[k * w->computeUnit->wfSize() + lane] =
                              this->src_vect[k].template get<CType>(w, lane);
                      }
                  }
@@ -428,7 +432,7 @@ namespace HsailISA
              // this is a complete hack to get around a compiler bug
              // (the compiler currently generates global access for private
              //  addresses (starting from 0). We need to add the private offset)
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                  if (mask[lane]) {
                      if (m->addr[lane] < w->privSizePerItem) {
  
@@ -454,7 +458,7 @@ namespace HsailISA
              m->pipeId = GLBMEM_PIPE;
              m->latency.set(w->computeUnit->shader->ticks(1));
              {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                      if (mask[lane]) {
                          assert(m->addr[lane] < w->spillSizePerItem);
  
@@ -483,7 +487,7 @@ namespace HsailISA
              m->pipeId = GLBMEM_PIPE;
              m->latency.set(w->computeUnit->shader->ticks(1));
              {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                      if (mask[lane]) {
                          assert(m->addr[lane] < w->privSizePerItem);
                          m->addr[lane] = m->addr[lane] + lane *
@@ -558,14 +562,14 @@ namespace HsailISA
  
          this->addr.calcVector(w, m->addr);
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              ((CType *)m->a_data)[lane] =
                  this->src[0].template get<CType>(w, lane);
          }
  
          // load second source operand for CAS
          if (NumSrcOperands > 1) {
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                  ((CType*)m->x_data)[lane] =
                      this->src[1].template get<CType>(w, lane);
              }
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc

index 9506a80abc2e1ee7995a81ced4e36d16e72d1f85..56ca8047c50760272505f0f84fcf9b865edca502 100644 (file)
--- a/src/arch/hsail/insts/pseudo_inst.cc
+++ b/src/arch/hsail/insts/pseudo_inst.cc
@@ -84,7 +84,7 @@ namespace HsailISA
          int op = 0;
          bool got_op = false;
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  int src_val0 = src1.get<int>(w, lane, 0);
                  if (got_op) {
@@ -182,7 +182,7 @@ namespace HsailISA
      {
      #if TRACING_ON
          const VectorMask &mask = w->get_pred();
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  int src_val1 = src1.get<int>(w, lane, 1);
                  int src_val2 = src1.get<int>(w, lane, 2);
@@ -205,7 +205,7 @@ namespace HsailISA
      {
      #if TRACING_ON
          const VectorMask &mask = w->get_pred();
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
                  int src_val2 = src1.get<int>(w, lane, 2);
@@ -231,7 +231,7 @@ namespace HsailISA
          std::string res_str;
          res_str = csprintf("krl_prt (%s)\n", disassemble());
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (!(lane & 7)) {
                  res_str += csprintf("DB%03d: ", (int)w->wfDynId);
              }
@@ -270,7 +270,7 @@ namespace HsailISA
          int src_val3 = -1;
          res_str = csprintf("krl_prt (%s)\n", disassemble());
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (!(lane & 7)) {
                  res_str += csprintf("DB%03d: ", (int)w->wfDynId);
              }
@@ -311,7 +311,7 @@ namespace HsailISA
          std::string res_str;
          res_str = csprintf("krl_prt (%s)\n", disassemble());
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (!(lane & 3)) {
                  res_str += csprintf("DB%03d: ", (int)w->wfDynId);
              }
@@ -350,7 +350,7 @@ namespace HsailISA
          int src_val3 = -1;
          res_str = csprintf("krl_prt (%s)\n", disassemble());
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (!(lane & 3)) {
                  res_str += csprintf("DB%03d: ", (int)w->wfDynId);
              }
@@ -391,7 +391,7 @@ namespace HsailISA
          std::string res_str;
          res_str = csprintf("krl_prt (%s)\n", disassemble());
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (!(lane & 7)) {
                  res_str += csprintf("DB%03d: ", (int)w->wfDynId);
              }
@@ -430,7 +430,7 @@ namespace HsailISA
          res_str += csprintf("  Executing on CU #%i\n", w->computeUnit->cu_id);
          res_str += csprintf("  Exec mask: ");
  
-        for (int i = VSZ - 1; i >= 0; --i) {
+        for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) {
              if (w->execMask(i))
                  res_str += "1";
              else
@@ -458,7 +458,7 @@ namespace HsailISA
          const VectorMask &mask = w->get_pred();
          int res = 0;
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  int src_val1 = src1.get<int>(w, lane, 1);
                  dest.set<int>(w, lane, res);
@@ -477,14 +477,14 @@ namespace HsailISA
          const VectorMask &mask = w->get_pred();
          int res = 0;
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  int src_val1 = src1.get<int>(w, lane, 1);
                  res += src_val1;
              }
          }
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  dest.set<int>(w, lane, res);
              }
@@ -497,19 +497,19 @@ namespace HsailISA
          const VectorMask &mask = w->get_pred();
          int res = 0;
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  int src_val1 = src1.get<int>(w, lane, 1);
  
                  if (src_val1) {
-                    if (lane < (VSZ/2)) {
+                    if (lane < (w->computeUnit->wfSize()/2)) {
                          res = res | ((uint32_t)(1) << lane);
                      }
                  }
              }
          }
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  dest.set<int>(w, lane, res);
              }
@@ -521,19 +521,20 @@ namespace HsailISA
      {
          const VectorMask &mask = w->get_pred();
          int res = 0;
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  int src_val1 = src1.get<int>(w, lane, 1);
  
                  if (src_val1) {
-                    if (lane >= (VSZ/2)) {
-                        res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
+                    if (lane >= (w->computeUnit->wfSize()/2)) {
+                        res = res | ((uint32_t)(1) <<
+                                     (lane - (w->computeUnit->wfSize()/2)));
                      }
                  }
              }
          }
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  dest.set<int>(w, lane, res);
              }
@@ -546,7 +547,7 @@ namespace HsailISA
          const VectorMask &mask = w->get_pred();
          int max_cnt = 0;
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  w->bar_cnt[lane]++;
  
@@ -567,7 +568,7 @@ namespace HsailISA
          const VectorMask &mask = w->get_pred();
          int max_cnt = 0;
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  w->bar_cnt[lane]--;
              }
@@ -592,7 +593,7 @@ namespace HsailISA
      {
          const VectorMask &mask = w->get_pred();
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  int src_val1 = src1.get<int>(w, lane, 1);
                  panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
@@ -605,7 +606,7 @@ namespace HsailISA
      Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
      {
          // the address is in src1 | src2
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              int src_val1 = src1.get<int>(w, lane, 1);
              int src_val2 = src1.get<int>(w, lane, 2);
              Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
@@ -622,7 +623,7 @@ namespace HsailISA
  
          calcAddr(w, m);
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
          }
  
@@ -661,7 +662,7 @@ namespace HsailISA
          GPUDynInstPtr m = gpuDynInst;
          calcAddr(w, m);
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
          }
  
@@ -736,7 +737,7 @@ namespace HsailISA
          const VectorMask &mask = w->get_pred();
          int src_val1 = 0;
  
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
              if (mask[lane]) {
                  src_val1 = src1.get<int>(w, lane, 1);
                  break;
@@ -758,7 +759,7 @@ namespace HsailISA
          const VectorMask &mask = w->get_pred();
          unsigned mst = true;
  
-        for (int lane = VSZ - 1; lane >= 0; --lane) {
+        for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
              if (mask[lane]) {
                  dest.set<int>(w, lane, mst);
                  mst = false;
@@ -773,7 +774,7 @@ namespace HsailISA
          int res = 0;
          bool got_res = false;
  
-        for (int lane = VSZ - 1; lane >= 0; --lane) {
+        for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
              if (mask[lane]) {
                  if (!got_res) {
                      res = src1.get<int>(w, lane, 1);
diff --git a/src/arch/hsail/operand.hh b/src/arch/hsail/operand.hh

index e3d275b101ed6240ba0ea9cb67becf478ff409e1..4d981ee00e48d2a082a37477b8bc17f7c13dcd63 100644 (file)
--- a/src/arch/hsail/operand.hh
+++ b/src/arch/hsail/operand.hh
@@ -42,6 +42,7 @@
   *  Defines classes encapsulating HSAIL instruction operands.
   */
  
+#include <limits>
  #include <string>
  
  #include "arch/hsail/Brig.h"
@@ -346,6 +347,8 @@ class CRegOperand : public BaseRegOperand
  template<typename T>
  class ImmOperand : public BaseOperand
  {
+  private:
+    uint16_t kind;
    public:
      T bits;
  
@@ -355,11 +358,21 @@ class ImmOperand : public BaseOperand
  
      template<typename OperandType>
      OperandType
-    get()
+    get(Wavefront *w)
      {
          assert(sizeof(OperandType) <= sizeof(T));
+        panic_if(w == nullptr, "WF pointer needs to be set");
+
+        switch (kind) {
+          // immediate operand is WF size
+          case Brig::BRIG_KIND_OPERAND_WAVESIZE:
+            return (OperandType)w->computeUnit->wfSize();
+            break;
  
-        return *(OperandType*)&bits;
+          default:
+            return *(OperandType*)&bits;
+            break;
+        }
      }
  
      // This version of get() takes a WF* and a lane id for
@@ -368,7 +381,7 @@ class ImmOperand : public BaseOperand
      OperandType
      get(Wavefront *w, int lane)
      {
-        return get<OperandType>();
+        return get<OperandType>(w);
      }
  };
  
@@ -388,16 +401,18 @@ ImmOperand<T>::init(unsigned opOffset, const BrigObject *obj)
              auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp;
  
              bits = *((T*)(obj->getData(cbptr->bytes + 4)));
-
+            kind = brigOp->kind;
              return true;
          }
          break;
  
        case Brig::BRIG_KIND_OPERAND_WAVESIZE:
-        bits = VSZ;
+        kind = brigOp->kind;
+        bits = std::numeric_limits<unsigned long long>::digits;
          return true;
  
        default:
+        kind = Brig::BRIG_KIND_NONE;
          return false;
      }
  }
@@ -409,6 +424,7 @@ ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
      const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
  
      if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+        kind = Brig::BRIG_KIND_NONE;
          return false;
      }
  
@@ -423,6 +439,7 @@ ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
          (const Brig::BrigOperand *)obj->getOperand(*data_offset);
  
      if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+        kind = Brig::BRIG_KIND_NONE;
          return false;
      }
  
@@ -456,7 +473,7 @@ class RegOrImmOperand : public BaseOperand
      OperandType
      get(Wavefront *w, int lane)
      {
-        return is_imm ?  imm_op.template get<OperandType>() :
+        return is_imm ?  imm_op.template get<OperandType>(w) :
                           reg_op.template get<OperandType>(w, lane);
      }
  
@@ -571,7 +588,7 @@ class AddrOperandBase : public BaseOperand
      uint64_t calcUniformBase();
  
    public:
-    virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0;
+    virtual void calcVector(Wavefront *w, std::vector<Addr> &addrVec) = 0;
      virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0;
  
      uint64_t offset;
@@ -586,7 +603,7 @@ class RegAddrOperand : public AddrOperandBase
      RegOperandType reg;
      void init(unsigned opOffset, const BrigObject *obj);
      uint64_t calcUniform();
-    void calcVector(Wavefront *w, uint64_t *addrVec);
+    void calcVector(Wavefront *w, std::vector<Addr> &addrVec);
      uint64_t calcLane(Wavefront *w, int lane=0);
      uint32_t opSize() { return reg.opSize(); }
      bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; }
@@ -641,11 +658,12 @@ RegAddrOperand<RegOperandType>::calcUniform()
  
  template<typename RegOperandType>
  void
-RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec)
+RegAddrOperand<RegOperandType>::calcVector(Wavefront *w,
+                                           std::vector<Addr> &addrVec)
  {
      Addr address = calcUniformBase();
  
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
          if (w->execMask(lane)) {
              if (reg.regFileChar == 's') {
                  addrVec[lane] = address + reg.template get<uint32_t>(w, lane);
@@ -680,7 +698,7 @@ class NoRegAddrOperand : public AddrOperandBase
    public:
      void init(unsigned opOffset, const BrigObject *obj);
      uint64_t calcUniform();
-    void calcVector(Wavefront *w, uint64_t *addrVec);
+    void calcVector(Wavefront *w, std::vector<Addr> &addrVec);
      uint64_t calcLane(Wavefront *w, int lane=0);
      std::string disassemble();
  };
@@ -698,11 +716,11 @@ NoRegAddrOperand::calcLane(Wavefront *w, int lane)
  }
  
  inline void
-NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec)
+NoRegAddrOperand::calcVector(Wavefront *w, std::vector<Addr> &addrVec)
  {
      uint64_t address = calcUniformBase();
  
-    for (int lane = 0; lane < VSZ; ++lane)
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane)
          addrVec[lane] = address;
  }
  
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py

index bd95f633571a2ab95139f496e839956fcde204bf..f580a09f79e371d7fef20199a27c5847e7eccc4d 100644 (file)
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -59,6 +59,7 @@ class VectorRegisterFile(SimObject):
  
      simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
      num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
+    wfSize = Param.Int(64, 'Wavefront size (in work items)')
      min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
  
  class Wavefront(SimObject):
@@ -68,6 +69,7 @@ class Wavefront(SimObject):
  
      simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
      wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
+    wfSize = Param.Int(64, 'Wavefront size (in work items)')
  
  class ComputeUnit(MemObject):
      type = 'ComputeUnit'
diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc

index 3b3291c03ca84ae231f5e95b60de0b1c59848e0a..6bb6be102a160c542e6337621c307bb11f0440ae 100644 (file)
--- a/src/gpu-compute/cl_driver.cc
+++ b/src/gpu-compute/cl_driver.cc
@@ -238,7 +238,7 @@ ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req)
        case HSA_GET_VSZ:
          {
              BufferArg buf(buf_addr, sizeof(uint32_t));
-            *((uint32_t*)buf.bufferPtr()) = VSZ;
+            *((uint32_t*)buf.bufferPtr()) = dispatcher->wfSize();
              buf.copyOut(tc->getMemProxy());
          }
          break;
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc

index b3a99b1829ee0146811ac82ac123a11ea3245477..5ec061172aa3518ec51b04c82ec8eebfc0ad65bb 100644 (file)
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -32,9 +32,10 @@
   *
   * Author: John Kalamatianos, Anthony Gutierrez
   */
-
  #include "gpu-compute/compute_unit.hh"
  
+#include <limits>
+
  #include "base/output.hh"
  #include "debug/GPUDisp.hh"
  #include "debug/GPUExec.hh"
@@ -76,14 +77,27 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
      _masterId(p->system->getMasterId(name() + ".ComputeUnit")),
      lds(*p->localDataStore), globalSeqNum(0),  wavefrontSize(p->wfSize)
  {
-    // this check will be eliminated once we have wavefront size support added
-    fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
+    /**
+     * This check is necessary because std::bitset only provides conversion
+     * to unsigned long or unsigned long long via to_ulong() or to_ullong().
+     * There are a few places in the code where to_ullong() is used; however,
+     * if the wavefront size is larger than a value the host can support then
+     * bitset will throw a runtime exception. We should remove all use of
+     * to_ulong() or to_ullong() so we can have a wavefront size greater than
+     * 64; however, until that is done this check is required.
+     */
+    fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
+             p->wfSize <= 0,
+             "WF size is larger than the host can support");
+    fatal_if(!isPowerOf2(wavefrontSize),
+             "Wavefront size should be a power of 2");
      // calculate how many cycles a vector load or store will need to transfer
      // its data over the corresponding buses
-    numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
-                                / (double)vrfToCoalescerBusWidth);
+    numCyclesPerStoreTransfer =
+        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
+                (double)vrfToCoalescerBusWidth);
  
-    numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
+    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                                 / coalescerToVrfBusWidth;
  
      lastVaddrWF.resize(numSIMDs);
@@ -93,24 +107,24 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
          lastVaddrWF[j].resize(p->n_wf);
  
          for (int i = 0; i < p->n_wf; ++i) {
-            lastVaddrWF[j][i].resize(VSZ);
+            lastVaddrWF[j][i].resize(wfSize());
  
              wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
              wfList[j][i]->setParent(this);
  
-            for (int k = 0; k < VSZ; ++k) {
+            for (int k = 0; k < wfSize(); ++k) {
                  lastVaddrWF[j][i][k] = 0;
              }
          }
      }
  
-    lastVaddrPhase.resize(numSIMDs);
+    lastVaddrSimd.resize(numSIMDs);
  
      for (int i = 0; i < numSIMDs; ++i) {
-        lastVaddrPhase[i] = LastVaddrWave();
+        lastVaddrSimd[i].resize(wfSize(), 0);
      }
  
-    lastVaddrCU = LastVaddrWave();
+    lastVaddrCU.resize(wfSize());
  
      lds.setParent(this);
  
@@ -122,10 +136,10 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
          fatal("Invalid WF execution policy (CU)\n");
      }
  
-    memPort.resize(VSZ);
+    memPort.resize(wfSize());
  
      // resize the tlbPort vectorArray
-    int tlbPort_width = perLaneTLB ? VSZ : 1;
+    int tlbPort_width = perLaneTLB ? wfSize() : 1;
      tlbPort.resize(tlbPort_width);
  
      cuExitCallback = new CUExitCallback(this);
@@ -144,12 +158,13 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
  ComputeUnit::~ComputeUnit()
  {
      // Delete wavefront slots
-
-    for (int j = 0; j < numSIMDs; ++j)
+    for (int j = 0; j < numSIMDs; ++j) {
          for (int i = 0; i < shader->n_wf; ++i) {
              delete wfList[j][i];
          }
-
+        lastVaddrSimd[j].clear();
+    }
+    lastVaddrCU.clear();
      readyList.clear();
      waveStatusList.clear();
      dispatchList.clear();
@@ -187,27 +202,25 @@ ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
      VectorMask init_mask;
      init_mask.reset();
  
-    for (int k = 0; k < VSZ; ++k) {
-        if (k + cnt * VSZ < trueWgSizeTotal)
+    for (int k = 0; k < wfSize(); ++k) {
+        if (k + cnt * wfSize() < trueWgSizeTotal)
              init_mask[k] = 1;
      }
  
      wfCtx->init_mask = init_mask.to_ullong();
      wfCtx->exec_mask = init_mask.to_ullong();
  
-    for (int i = 0; i < VSZ; ++i) {
-        wfCtx->bar_cnt[i] = 0;
-    }
+    wfCtx->bar_cnt.resize(wfSize(), 0);
  
      wfCtx->max_bar_cnt = 0;
      wfCtx->old_barrier_cnt = 0;
      wfCtx->barrier_cnt = 0;
  
      wfCtx->privBase = ndr->q.privMemStart;
-    ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
+    ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
  
      wfCtx->spillBase = ndr->q.spillMemStart;
-    ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
+    ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
  
      wfCtx->pc = 0;
      wfCtx->rpc = UINT32_MAX;
@@ -265,10 +278,12 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
      w->dynwaveid = cnt;
      w->init_mask = wfCtx->init_mask;
  
-    for (int k = 0; k < VSZ; ++k) {
-        w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
-        w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
-        w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
+    for (int k = 0; k < wfSize(); ++k) {
+        w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
+        w->workitemid[1][k] =
+            ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1];
+        w->workitemid[2][k] =
+            (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
  
          w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
              trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
@@ -277,9 +292,9 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
  
      w->old_barrier_cnt = wfCtx->old_barrier_cnt;
      w->barrier_cnt = wfCtx->barrier_cnt;
-    w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
+    w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
  
-    for (int i = 0; i < VSZ; ++i) {
+    for (int i = 0; i < wfSize(); ++i) {
          w->bar_cnt[i] = wfCtx->bar_cnt[i];
      }
  
@@ -315,16 +330,17 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
      // is this the last wavefront in the workgroup
      // if set the spillWidth to be the remaining work-items
      // so that the vector access is correct
-    if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
-        w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
+    if ((cnt + 1) * wfSize() >= trueWgSizeTotal) {
+        w->spillWidth = trueWgSizeTotal - (cnt * wfSize());
      } else {
-        w->spillWidth = VSZ;
+        w->spillWidth = wfSize();
      }
  
      DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
              "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
  
      w->start(++_n_wave, ndr->q.code_ptr);
+    wfCtx->bar_cnt.clear();
  }
  
  void
@@ -339,7 +355,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
      // Send L1 cache acquire
      // isKernel + isAcquire = Kernel Begin
      if (shader->impl_kern_boundary_sync) {
-        GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
+        GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this,
                                                                  nullptr,
                                                                  nullptr, 0);
  
@@ -374,7 +390,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
          if (w->status == Wavefront::S_STOPPED) {
              // if we have scheduled all work items then stop
              // scheduling wavefronts
-            if (cnt * VSZ >= trueWgSizeTotal)
+            if (cnt * wfSize() >= trueWgSizeTotal)
                  break;
  
              // reserve vector registers for the scheduled wavefront
@@ -420,7 +436,7 @@ ComputeUnit::ReadyWorkgroup(NDRange *ndr)
      // work item of the work group
      int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
      bool vregAvail = true;
-    int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
+    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
      int freeWfSlots = 0;
      // check if the total number of VGPRs required by all WFs of the WG
      // fit in the VRFs of all SIMD units
@@ -623,7 +639,7 @@ ComputeUnit::init()
      // Setup space for call args
      for (int j = 0; j < numSIMDs; ++j) {
          for (int i = 0; i < shader->n_wf; ++i) {
-            wfList[j][i]->initCallArgMem(shader->funcargs_size);
+            wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
          }
      }
  
@@ -1193,15 +1209,15 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
          Addr last = 0;
  
          switch(computeUnit->prefetchType) {
-          case Enums::PF_CU:
+        case Enums::PF_CU:
              last = computeUnit->lastVaddrCU[mp_index];
              break;
-          case Enums::PF_PHASE:
-            last = computeUnit->lastVaddrPhase[simdId][mp_index];
+        case Enums::PF_PHASE:
+            last = computeUnit->lastVaddrSimd[simdId][mp_index];
              break;
-          case Enums::PF_WF:
+        case Enums::PF_WF:
              last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
-          default:
+        default:
              break;
          }
  
@@ -1215,7 +1231,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
          DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
  
          computeUnit->lastVaddrCU[mp_index] = vaddr;
-        computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
+        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
          computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
  
          stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
@@ -1488,7 +1504,7 @@ ComputeUnit::regStats()
          ;
  
      ldsBankConflictDist
-       .init(0, VSZ, 2)
+       .init(0, wfSize(), 2)
         .name(name() + ".lds_bank_conflicts")
         .desc("Number of bank conflicts per LDS memory packet")
         ;
@@ -1499,27 +1515,28 @@ ComputeUnit::regStats()
          ;
  
      pageDivergenceDist
-       // A wavefront can touch 1 to VSZ pages per memory instruction.
-       // The number of pages per bin can be configured (here it's 4).
-       .init(1, VSZ, 4)
+        // A wavefront can touch up to N pages per memory instruction, where
+        // N is equal to the wavefront size.
+        // The number of pages per bin can be configured (here it's 4).
+       .init(1, wfSize(), 4)
         .name(name() + ".page_divergence_dist")
         .desc("pages touched per wf (over all mem. instr.)")
         ;
  
      controlFlowDivergenceDist
-        .init(1, VSZ, 4)
+        .init(1, wfSize(), 4)
          .name(name() + ".warp_execution_dist")
          .desc("number of lanes active per instruction (oval all instructions)")
          ;
  
      activeLanesPerGMemInstrDist
-        .init(1, VSZ, 4)
+        .init(1, wfSize(), 4)
          .name(name() + ".gmem_lanes_execution_dist")
          .desc("number of active lanes per global memory instruction")
          ;
  
      activeLanesPerLMemInstrDist
-        .init(1, VSZ, 4)
+        .init(1, wfSize(), 4)
          .name(name() + ".lmem_lanes_execution_dist")
          .desc("number of active lanes per local memory instruction")
          ;
@@ -1531,7 +1548,7 @@ ComputeUnit::regStats()
  
      numVecOpsExecuted
          .name(name() + ".num_vec_ops_executed")
-        .desc("number of vec ops executed (e.g. VSZ/inst)")
+        .desc("number of vec ops executed (e.g. WF size/inst)")
          ;
  
      totalCycles
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh

index f47c27a0a4c699483b65c1e027d9d4657f2815c0..a234cbeb56886ab1f8b0285c36c4f37181ba8e16 100644 (file)
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -161,22 +161,8 @@ class ComputeUnit : public MemObject
      // if fixed-stride prefetching, this is the stride.
      int prefetchStride;
  
-    class LastVaddrWave
-    {
-      public:
-        Addr vaddrs[VSZ];
-        Addr& operator[](int idx) {
-            return vaddrs[idx];
-        }
-
-        LastVaddrWave() {
-            for (int i = 0; i < VSZ; ++i)
-                vaddrs[i] = 0;
-        }
-    };
-
-    LastVaddrWave lastVaddrCU;
-    std::vector<LastVaddrWave> lastVaddrPhase;
+    std::vector<Addr> lastVaddrCU;
+    std::vector<std::vector<Addr>> lastVaddrSimd;
      std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
      Enums::PrefetchType prefetchType;
      EXEC_POLICY exec_policy;
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc

index 95c0c56a20eb878a66b31c9d2bd5c9224d37f067..d1d011c0dc66478c2541ddfa3e8cbca62b698c7e 100644 (file)
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -387,6 +387,12 @@ GpuDispatcher::getNumCUs()
      return shader->cuList.size();
  }
  
+int
+GpuDispatcher::wfSize() const
+{
+    return shader->cuList[0]->wfSize();
+}
+
  void
  GpuDispatcher::setFuncargsSize(int funcargs_size)
  {
diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh

index 76f93265549d66ef8afe39a3b0a3af73d8271357..e984af494dc7c541ead46ef3ad8a21f0d6c542bb 100644 (file)
--- a/src/gpu-compute/dispatcher.hh
+++ b/src/gpu-compute/dispatcher.hh
@@ -157,6 +157,7 @@ class GpuDispatcher : public DmaDevice
  
          // helper functions to retrieve/set GPU attributes
          int getNumCUs();
+        int wfSize() const;
          void setFuncargsSize(int funcargs_size);
  };
  
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc

index 3550186663d45fcb43539f229c4375506edbb410..a6a4d86dbe7c861a7751c5013870adcb7aa786cd 100644 (file)
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -179,9 +179,9 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
                  int physVgpr = w->remap(dst, sizeof(c0), 1);
                  // save the physical VGPR index
                  regVec.push_back(physVgpr);
-                c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+                c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
  
-                for (int i = 0; i < VSZ; ++i) {
+                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                      if (m->exec_mask[i]) {
                          DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                  "$%s%d <- %d global ld done (src = wavefront "
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc

index 2f35a983c527c725808188f677d3dc86857a9c77..1806e79e47ff8926bca365c798e4f292f8ec5502 100644 (file)
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -42,11 +42,29 @@
  
  GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                         GPUStaticInst *_staticInst, uint64_t instSeqNum)
-    : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
+    : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0),
+      m_op(Enums::MO_UNDEF),
        memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false),
        statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
  {
-    tlbHitLevel.assign(VSZ, -1);
+    tlbHitLevel.assign(computeUnit()->wfSize(), -1);
+    d_data = new uint8_t[computeUnit()->wfSize() * 16];
+    a_data = new uint8_t[computeUnit()->wfSize() * 8];
+    x_data = new uint8_t[computeUnit()->wfSize() * 8];
+    for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
+        a_data[i] = 0;
+        x_data[i] = 0;
+    }
+    for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) {
+        d_data[i] = 0;
+    }
+}
+
+GPUDynInst::~GPUDynInst()
+{
+    delete[] d_data;
+    delete[] a_data;
+    delete[] x_data;
  }
  
  void
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh

index e44d8f80d94010de0355897ea7b99e33f040f8e8..46774d8675ff734fa7056929a9fe9b5b23f2639a 100644 (file)
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -205,7 +205,7 @@ class GPUDynInst : public GPUExecContext
    public:
      GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
                 uint64_t instSeqNum);
-
+    ~GPUDynInst();
      void execute();
      int numSrcRegOperands();
      int numDstRegOperands();
@@ -226,15 +226,15 @@ class GPUDynInst : public GPUExecContext
      Enums::StorageClassType executedAs();
  
      // The address of the memory operation
-    Addr addr[VSZ];
+    std::vector<Addr> addr;
      Addr pAddr;
  
      // The data to get written
-    uint8_t d_data[VSZ * 16];
+    uint8_t *d_data;
      // Additional data (for atomics)
-    uint8_t a_data[VSZ * 8];
+    uint8_t *a_data;
      // Additional data (for atomics)
-    uint8_t x_data[VSZ * 8];
+    uint8_t *x_data;
      // The execution mask
      VectorMask exec_mask;
  
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc

index 7f919c5f41d5c78c7f6d2ac6607e372eb6d2cc45..a970d8f9b7438dc4133581b4b8e19a28bbf7c92a 100644 (file)
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -148,9 +148,9 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
              int physVgpr = w->remap(dst,sizeof(c0),1);
              // save the physical VGPR index
              regVec.push_back(physVgpr);
-            c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+            c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
  
-            for (int i = 0; i < VSZ; ++i) {
+            for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                  if (m->exec_mask[i]) {
                      // write the value into the physical VGPR. This is a purely
                      // functional operation. No timing is modeled.
diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh

index 4f803283290ad3401728b849baaee7b4979c3e21..5ade897891f2a1173250208d317a37a3c6b5ac27 100644 (file)
--- a/src/gpu-compute/misc.hh
+++ b/src/gpu-compute/misc.hh
@@ -37,28 +37,14 @@
  #define __MISC_HH__
  
  #include <bitset>
+#include <limits>
  #include <memory>
  
  #include "base/misc.hh"
  
  class GPUDynInst;
  
-// wavefront size of the machine
-static const int VSZ = 64;
-
-/*
- This check is necessary because std::bitset only provides conversion to
- unsigned long or unsigned long long via to_ulong() or to_ullong(). there are
- a few places in the code where to_ullong() is used, however if VSZ is larger
- than a value the host can support then bitset will throw a runtime exception.
-
- we should remove all use of to_long() or to_ullong() so we can have VSZ
- greater than 64b, however until that is done this assert is required.
- */
-static_assert(VSZ <= sizeof(unsigned long long) * 8,
-              "VSZ is larger than the host can support");
-
-typedef std::bitset<VSZ> VectorMask;
+typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask;
  typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
  
  class WaitClass
diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh

index 092303c008d9435c5527608f20aef7a7037ddf72..7bca757b83c3bb7eef7270d659db5783235dd284 100644 (file)
--- a/src/gpu-compute/qstruct.hh
+++ b/src/gpu-compute/qstruct.hh
@@ -100,7 +100,7 @@ struct WFContext
  {
      // 32 bit values
      // barrier state
-    int bar_cnt[VSZ];
+    std::vector<int> bar_cnt;
  
      // id (which WF in the WG)
      int cnt;
diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc

index 8b7dc0691a6e8115b5f7587ee371f23fb12db074..c43d765afbe97b63829462268261f029acb61bd6 100644 (file)
--- a/src/gpu-compute/vector_register_file.cc
+++ b/src/gpu-compute/vector_register_file.cc
@@ -63,7 +63,7 @@ VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
      nxtBusy.clear();
      nxtBusy.resize(numRegsPerSimd, 0);
  
-    vgprState->init(numRegsPerSimd);
+    vgprState->init(numRegsPerSimd, p->wfSize);
  }
  
  void
diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc

index f231b0579868cfc0f8b4d38c47ca66aff1a7bd5d..e177d3b64985af853e48a71b93eb44d22ba74eed 100644 (file)
--- a/src/gpu-compute/vector_register_state.cc
+++ b/src/gpu-compute/vector_register_state.cc
@@ -35,6 +35,8 @@
  
  #include "gpu-compute/vector_register_state.hh"
  
+#include <limits>
+
  #include "gpu-compute/compute_unit.hh"
  
  VecRegisterState::VecRegisterState() : computeUnit(nullptr)
@@ -51,8 +53,19 @@ VecRegisterState::setParent(ComputeUnit *_computeUnit)
  }
  
  void
-VecRegisterState::init(uint32_t _size)
+VecRegisterState::init(uint32_t _size, uint32_t wf_size)
  {
      s_reg.resize(_size);
+    fatal_if(wf_size > std::numeric_limits<unsigned long long>::digits ||
+             wf_size <= 0,
+             "WF size is larger than the host can support or is zero");
+    fatal_if((wf_size & (wf_size - 1)) != 0,
+             "Wavefront size should be a power of 2");
+    for (int i = 0; i < s_reg.size(); ++i) {
+        s_reg[i].resize(wf_size, 0);
+    }
      d_reg.resize(_size);
+    for (int i = 0; i < d_reg.size(); ++i) {
+        d_reg[i].resize(wf_size, 0);
+    }
  }
diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh

index a233b9acce6a118c55c9ec1f17718757dcf6064d..97a0d8e257d86e19f30c77a2073a0caee0b4368d 100644 (file)
--- a/src/gpu-compute/vector_register_state.hh
+++ b/src/gpu-compute/vector_register_state.hh
@@ -51,7 +51,7 @@ class VecRegisterState
  {
    public:
      VecRegisterState();
-    void init(uint32_t _size);
+    void init(uint32_t _size, uint32_t wf_size);
  
      const std::string& name() const { return _name; }
      void setParent(ComputeUnit *_computeUnit);
@@ -93,9 +93,9 @@ class VecRegisterState
      ComputeUnit *computeUnit;
      std::string _name;
      // 32-bit Single Precision Vector Register State
-    std::vector<std::array<uint32_t, VSZ>> s_reg;
+    std::vector<std::vector<uint32_t>> s_reg;
      // 64-bit Double Precision Vector Register State
-    std::vector<std::array<uint64_t, VSZ>> d_reg;
+    std::vector<std::vector<uint64_t>> d_reg;
  };
  
  #endif // __VECTOR_REGISTER_STATE_HH__
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc

index 7cdec53e573f46c17e2d4dff139095cd9ed56a10..a20330082c6bd325618b5a058448e0d07d768b12 100644 (file)
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -55,7 +55,6 @@ Wavefront::Wavefront(const Params *p)
      last_trace = 0;
      simdId = p->simdId;
      wfSlotId = p->wf_slot_id;
-
      status = S_STOPPED;
      reservedVectorRegs = 0;
      startVgprIndex = 0;
@@ -77,12 +76,20 @@ Wavefront::Wavefront(const Params *p)
      mem_trace_busy = 0;
      old_vgpr_tcnt = 0xffffffffffffffffll;
      old_dgpr_tcnt = 0xffffffffffffffffll;
+    old_vgpr.resize(p->wfSize);
  
      pendingFetch = false;
      dropFetch = false;
      condRegState = new ConditionRegisterState();
      maxSpVgprs = 0;
      maxDpVgprs = 0;
+    last_addr.resize(p->wfSize);
+    workitemFlatId.resize(p->wfSize);
+    old_dgpr.resize(p->wfSize);
+    bar_cnt.resize(p->wfSize);
+    for (int i = 0; i < 3; ++i) {
+        workitemid[i].resize(p->wfSize);
+    }
  }
  
  void
@@ -144,6 +151,7 @@ Wavefront::~Wavefront()
  {
      if (callArgMem)
          delete callArgMem;
+    delete condRegState;
  }
  
  void
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh

index 0abab8e839b894a41d6ba3edfb50783ea7be3a9c..5a5386a3d4cedd93dd194c81d991e0cd5028ba97 100644 (file)
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -83,6 +83,7 @@ class CallArgMem
    public:
      // pointer to buffer for storing function arguments
      uint8_t *mem;
+    int wfSize;
      // size of function args
      int funcArgsSizePerItem;
  
@@ -90,13 +91,13 @@ class CallArgMem
      int
      getLaneOffset(int lane, int addr)
      {
-        return addr * VSZ + sizeof(CType) * lane;
+        return addr * wfSize + sizeof(CType) * lane;
      }
  
-    CallArgMem(int func_args_size_per_item)
-      : funcArgsSizePerItem(func_args_size_per_item)
+    CallArgMem(int func_args_size_per_item, int wf_size)
+        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
      {
-        mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
+        mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
      }
  
      ~CallArgMem()
@@ -192,9 +193,9 @@ class Wavefront : public SimObject
      bool isOldestInstALU();
      bool isOldestInstBarrier();
      // used for passing spill address to DDInstGPU
-    uint64_t last_addr[VSZ];
-    uint32_t workitemid[3][VSZ];
-    uint32_t workitemFlatId[VSZ];
+    std::vector<Addr> last_addr;
+    std::vector<uint32_t> workitemid[3];
+    std::vector<uint32_t> workitemFlatId;
      uint32_t workgroupid[3];
      uint32_t workgroupsz[3];
      uint32_t gridsz[3];
@@ -230,14 +231,14 @@ class Wavefront : public SimObject
      uint32_t startVgprIndex;
  
      // Old value of destination gpr (for trace)
-    uint32_t old_vgpr[VSZ];
+    std::vector<uint32_t> old_vgpr;
      // Id of destination gpr (for trace)
      uint32_t old_vgpr_id;
      // Tick count of last old_vgpr copy
      uint64_t old_vgpr_tcnt;
  
      // Old value of destination gpr (for trace)
-    uint64_t old_dgpr[VSZ];
+    std::vector<uint64_t> old_dgpr;
      // Id of destination gpr (for trace)
      uint32_t old_dgpr_id;
      // Tick count of last old_vgpr copy
@@ -247,7 +248,7 @@ class Wavefront : public SimObject
      VectorMask init_mask;
  
      // number of barriers this WF has joined
-    int bar_cnt[VSZ];
+    std::vector<int> bar_cnt;
      int max_bar_cnt;
      // Flag to stall a wave on barrier
      bool stalledAtBarrier;
@@ -296,9 +297,9 @@ class Wavefront : public SimObject
      // argument memory for hsail call instruction
      CallArgMem *callArgMem;
      void
-    initCallArgMem(int func_args_size_per_item)
+    initCallArgMem(int func_args_size_per_item, int wf_size)
      {
-        callArgMem = new CallArgMem(func_args_size_per_item);
+        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
      }
  
      template<typename CType>
@@ -327,7 +328,6 @@ class Wavefront : public SimObject
      }
  
      void start(uint64_t _wfDynId, uint64_t _base_ptr);
-
      void exec();
      void updateResources();
      int ready(itype_e type);
author	jkalamat <john.kalamatianos@amd.com>
	Thu, 9 Jun 2016 15:24:55 +0000 (11:24 -0400)
committer	jkalamat <john.kalamatianos@amd.com>
	Thu, 9 Jun 2016 15:24:55 +0000 (11:24 -0400)
configs/example/apu_se.py		patch \| blob \| history
src/arch/hsail/gen.py		patch \| blob \| history
src/arch/hsail/insts/branch.hh		patch \| blob \| history
src/arch/hsail/insts/main.cc		patch \| blob \| history
src/arch/hsail/insts/mem.hh		patch \| blob \| history
src/arch/hsail/insts/mem_impl.hh		patch \| blob \| history
src/arch/hsail/insts/pseudo_inst.cc		patch \| blob \| history
src/arch/hsail/operand.hh		patch \| blob \| history
src/gpu-compute/GPU.py		patch \| blob \| history
src/gpu-compute/cl_driver.cc		patch \| blob \| history
src/gpu-compute/compute_unit.cc		patch \| blob \| history
src/gpu-compute/compute_unit.hh		patch \| blob \| history
src/gpu-compute/dispatcher.cc		patch \| blob \| history
src/gpu-compute/dispatcher.hh		patch \| blob \| history
src/gpu-compute/global_memory_pipeline.cc		patch \| blob \| history
src/gpu-compute/gpu_dyn_inst.cc		patch \| blob \| history
src/gpu-compute/gpu_dyn_inst.hh		patch \| blob \| history
src/gpu-compute/local_memory_pipeline.cc		patch \| blob \| history
src/gpu-compute/misc.hh		patch \| blob \| history
src/gpu-compute/qstruct.hh		patch \| blob \| history
src/gpu-compute/vector_register_file.cc		patch \| blob \| history
src/gpu-compute/vector_register_state.cc		patch \| blob \| history
src/gpu-compute/vector_register_state.hh		patch \| blob \| history
src/gpu-compute/wavefront.cc		patch \| blob \| history
src/gpu-compute/wavefront.hh		patch \| blob \| history