- PKE simulation code almost complete. Still missing:
authorFrank Ch. Eigler <fche@redhat.com>
Wed, 11 Feb 1998 19:42:15 +0000 (19:42 +0000)
committerFrank Ch. Eigler <fche@redhat.com>
Wed, 11 Feb 1998 19:42:15 +0000 (19:42 +0000)
  * handling of super duper packed UNPACK arguments
  * skipping of in-progress instruction on break/stop
  * interrupt generation to 5900
  * PATH2/PATH3 status checking & masking
  * ability to write to FIFO one word (instead of quadword) at a time

sim/mips/sky-pke.c
sim/mips/sky-pke.h

index f46873bfc7a93aa9e5c0929275eecc2918be87f9..eb64cd32c7b9f19ed4ca287f77b6cb4f4871c227 100644 (file)
@@ -1,5 +1,8 @@
 /* Copyright (C) 1998, Cygnus Solutions */
 
+/* Debugging PKE? */
+#define PKE_DEBUG 
+
 #include <stdlib.h>
 #include "sky-pke.h"
 #include "sky-dma.h"
@@ -8,6 +11,7 @@
 #include "sky-vu1.h"
 #include "sky-gpuif.h"
 
+
 /* Imported functions */
 
 void device_error (device *me, char* message);  /* device.c */
@@ -21,11 +25,38 @@ static int pke_io_write_buffer(device*, const void*, int, address_word,
                               unsigned, sim_cpu*, sim_cia);
 static void pke_issue(struct pke_device*);
 static void pke_pc_advance(struct pke_device*, int num_words);
-static unsigned_4* pke_pc_operand(struct pke_device*, int word_num);
-static struct fifo_quadword* pke_pc_fifo(struct pke_device*, int word_num);
+static unsigned_4* pke_pc_operand(struct pke_device*, int operand_num);
+static struct fifo_quadword* pke_pc_fifo(struct pke_device*, int operand_num, 
+                                        unsigned_4** operand);
 static int pke_track_write(struct pke_device*, const void* src, int len,
                           address_word dest, unsigned_4 sourceaddr);
 static void pke_attach(SIM_DESC sd, struct pke_device* me);
+enum pke_check_target { chk_vu, chk_path1, chk_path2, chk_path3 }; /* device selector passed to pke_check_stall() */
+static int pke_check_stall(struct pke_device* me, enum pke_check_target what);
+static void pke_flip_dbf(struct pke_device* me);
+/* PKEcode handlers */
+static void pke_code_nop(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_stcycl(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_offset(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_base(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_itop(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_stmod(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_mskpath3(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_pkemark(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_flushe(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_flush(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_flusha(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_pkemscal(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_pkemscnt(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_pkemscalf(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_stmask(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_strow(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_stcol(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_mpg(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_direct(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_directhl(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_unpack(struct pke_device* me, unsigned_4 pkecode);
+static void pke_code_error(struct pke_device* me, unsigned_4 pkecode);
 
 
 
@@ -292,26 +323,8 @@ pke_io_write_buffer(device *me_,
       switch(reg_num)
        {
        case PKE_REG_FBRST:
-         /* XXX: order of evaluation?  STP && STC ?? */
-         if(BIT_MASK_GET(input[0], 0, 0)) /* RST bit */
-           {
-             /* clear FIFO: also prevents re-execution attempt of
-                 possible stalled instruction */
-             me->fifo_num_elements = me->fifo_pc;
-             /* clear registers */
-             memset(me->regs, 0, sizeof(me->regs));
-             me->flags = 0;
-             me->qw_pc = 0;
-           }
-         if(BIT_MASK_GET(input[0], 1, 1)) /* FBK bit */
-           {
-             PKE_REG_MASK_SET(me, STAT, PFS, 1);
-           }
-         if(BIT_MASK_GET(input[0], 2, 2)) /* STP bit */
-           {
-             /* XXX: how to safely abort "currently executing" (=> stalled) instruction? */
-             PKE_REG_MASK_SET(me, STAT, PSS, 1);
-           }
+         /* Order these tests from least to most overriding, in case
+             multiple bits are set. */
          if(BIT_MASK_GET(input[0], 2, 2)) /* STC bit */
            {
              /* clear a bunch of status bits */
@@ -323,6 +336,26 @@ pke_io_write_buffer(device *me_,
              PKE_REG_MASK_SET(me, STAT, ER1, 0);
              /* will allow resumption of possible stalled instruction */
            }
+         if(BIT_MASK_GET(input[0], 2, 2)) /* STP bit */
+           {
+             /* XXX: how to safely abort "currently executing" (=> stalled) instruction? */
+             PKE_REG_MASK_SET(me, STAT, PSS, 1);
+           }
+         if(BIT_MASK_GET(input[0], 1, 1)) /* FBK bit */
+           {
+             PKE_REG_MASK_SET(me, STAT, PFS, 1);
+           }
+         if(BIT_MASK_GET(input[0], 0, 0)) /* RST bit */
+           {
+             /* clear FIFO by skipping to word after PC: also
+                 prevents re-execution attempt of possible stalled
+                 instruction */
+             me->fifo_num_elements = me->fifo_pc;
+             /* clear registers */
+             memset(me->regs, 0, sizeof(me->regs));
+             me->flags = 0;
+             me->qw_pc = 0;
+           }
          break;
 
        case PKE_REG_ERR:
@@ -424,7 +457,6 @@ pke_io_write_buffer(device *me_,
               (SIM_ADDR) (me->pke_number == 0 ? DMA_CHANNEL0_PKTFLAG : DMA_CHANNEL1_PKTFLAG),
               (void*) & fqw->dma_tag_present,
               sizeof(unsigned_4));
-      /* XXX: check RC */
 
       me->fifo_num_elements++;
 
@@ -450,20 +482,21 @@ pke_issue(struct pke_device* me)
   unsigned_4 fw;
   unsigned_4 cmd, intr, num;
   unsigned_4 imm;
-  int next_pps_state; /* PPS after this instruction issue attempt */
 
   /* 1 -- test go / no-go for PKE execution */
 
   /* check for stall/halt control bits */
-  /* XXX: What is the PEW bit for? */
-  if(PKE_REG_MASK_GET(me, STAT, PSS) ||
+  if(PKE_REG_MASK_GET(me, STAT, PSS) || /* XXX: PSS may be a special case */
      PKE_REG_MASK_GET(me, STAT, PFS) ||
+     /* PEW bit not a reason to keep stalling - it's re-checked below */
+     /* PGW bit not a reason to keep stalling - it's re-checked below */
      /* maskable stall controls: ER0, ER1, PIS */
      (PKE_REG_MASK_GET(me, STAT, ER0) && !PKE_REG_MASK_GET(me, ERR, ME0)) ||
      (PKE_REG_MASK_GET(me, STAT, ER1) && !PKE_REG_MASK_GET(me, ERR, ME1)) ||
      (PKE_REG_MASK_GET(me, STAT, PIS) && !PKE_REG_MASK_GET(me, ERR, MII)))
     {
-      /* XXX */
+      /* try again next cycle; no state change */
+      return;
     }
   /* XXX: handle PSS by *skipping* instruction? */
 
@@ -474,21 +507,14 @@ pke_issue(struct pke_device* me)
 
   /* 2 -- fetch PKE instruction */
 
-  /* "fetch" instruction quadword */ 
-  fqw = & me->fifo[me->fifo_pc];
+  /* skip over DMA tag, if present */
+  pke_pc_advance(me, 0);
 
-  /* skip over DMA tags, if present */
-  if((fqw->dma_tag_present != 0) && (me->qw_pc < 2))
-    {
-      ASSERT(me->qw_pc == 0);
-      /* XXX: check validity of DMA tag; if bad, set ER0 flag */
-      me->qw_pc = 2;
-    }
-  
-  /* "fetch" instruction word */
+  /* "fetch" instruction quadword and word */ 
+  fqw = & me->fifo[me->fifo_pc];
   fw = fqw->data[me->qw_pc];
 
-  /* store it in PKECODE register */
+  /* store word in PKECODE register */
   me->regs[PKE_REG_CODE][0] = fw;
 
 
@@ -510,894 +536,1106 @@ pke_issue(struct pke_device* me)
 
   /* decoding */
   PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_DECODE);
-  next_pps_state = PKE_REG_STAT_PPS_IDLE;  /* assume instruction completes */
 
-  /* decode */
+  /* decode & execute */
   if(IS_PKE_CMD(cmd, PKENOP))
-    {
-      /* no work required, yey */
-      pke_pc_advance(me, 1);
-    }
+    pke_code_nop(me, fw);
   else if(IS_PKE_CMD(cmd, STCYCL))
-    {
-      /* copy immediate value into CYCLE reg */
-      me->regs[PKE_REG_CYCLE][0] = imm;
-      pke_pc_advance(me, 1);
-    }
+    pke_code_stcycl(me, fw);
   else if(me->pke_number == 1 && IS_PKE_CMD(cmd, OFFSET))
-    {
-      /* copy 10 bits to OFFSET field */
-      PKE_REG_MASK_SET(me, OFST, OFFSET, BIT_MASK_GET(imm, 0, 9));
-      /* clear DBF bit */
-      PKE_REG_MASK_SET(me, DBF, DF, 0);
-      /* clear other DBF bit */
-      PKE_REG_MASK_SET(me, STAT, DBF, 0);
-      /* set TOPS = BASE */
-      PKE_REG_MASK_SET(me, TOPS, TOPS,
-                      PKE_REG_MASK_GET(me, BASE, BASE));
-      pke_pc_advance(me, 1);
-    }
+    pke_code_offset(me, fw);
   else if(me->pke_number == 1 && IS_PKE_CMD(cmd, BASE))
-    {
-      /* copy 10 bits to BASE field */
-      PKE_REG_MASK_SET(me, BASE, BASE, BIT_MASK_GET(imm, 0, 9));
-      /* clear DBF bit */
-      PKE_REG_MASK_SET(me, DBF, DF, 0);
-      /* clear other DBF bit */
-      PKE_REG_MASK_SET(me, STAT, DBF, 0);
-      /* set TOPS = BASE */
-      PKE_REG_MASK_SET(me, TOPS, TOPS,
-                      PKE_REG_MASK_GET(me, BASE, BASE));
-      pke_pc_advance(me, 1);
-    }
+    pke_code_base(me, fw);
   else if(IS_PKE_CMD(cmd, ITOP))
-    {
-      /* copy 10 bits to ITOPS field */
-      PKE_REG_MASK_SET(me, ITOPS, ITOPS, BIT_MASK_GET(imm, 0, 9));
-      pke_pc_advance(me, 1);
-    }
+    pke_code_itop(me, fw);
   else if(IS_PKE_CMD(cmd, STMOD))
-    {
-      /* copy 2 bits to MODE register */
-      PKE_REG_MASK_SET(me, MODE, MDE, BIT_MASK_GET(imm, 0, 2));
-      pke_pc_advance(me, 1);
-    }
-  else if(me->pke_number == 1 && IS_PKE_CMD(cmd, MSKPATH3)) /* MSKPATH3 */
-    {
-      /* XXX: what to do with this?  DMA control register? */
-      pke_pc_advance(me, 1);
-    }
+    pke_code_stmod(me, fw);
+  else if(me->pke_number == 1 && IS_PKE_CMD(cmd, MSKPATH3))
+    pke_code_mskpath3(me, fw);
   else if(IS_PKE_CMD(cmd, PKEMARK))
-    {
-      /* copy 16 bits to MARK register */
-      PKE_REG_MASK_SET(me, MARK, MARK, BIT_MASK_GET(imm, 0, 15));
-      /* set MRK bit in STAT register - CPU2 v2.1 docs incorrect */
-      PKE_REG_MASK_SET(me, STAT, MRK, 1);
-      pke_pc_advance(me, 1);
-    }
+    pke_code_pkemark(me, fw);
   else if(IS_PKE_CMD(cmd, FLUSHE))
-    {
-      /* read VU status word */
-      unsigned_4 vu_stat;
-      sim_read(NULL,
-              (SIM_ADDR) (me->pke_number == 0 ? VPE0_STAT : VPE1_STAT),
-              (void*) & vu_stat,
-              sizeof(unsigned_4));
-      /* XXX: check RC */
-
-      /* check if VBS bit is clear, i.e., VU is idle */
-      if(BIT_MASK_GET(vu_stat, VU_REG_STAT_VBS_B, VU_REG_STAT_VBS_E) == 0)
-       {
-         /* VU idle */
-         /* advance PC */
-         pke_pc_advance(me, 1);
-       }
-      else
-       {
-         /* VU busy */
-         next_pps_state = PKE_REG_STAT_PPS_WAIT;
-         /* retry this instruction next clock */
-       }
-    }
+    pke_code_flushe(me, fw);
   else if(me->pke_number == 1 && IS_PKE_CMD(cmd, FLUSH))
+    pke_code_flush(me, fw);
+  else if(me->pke_number == 1 && IS_PKE_CMD(cmd, FLUSHA))
+    pke_code_flusha(me, fw);
+  else if(IS_PKE_CMD(cmd, PKEMSCAL))
+    pke_code_pkemscal(me, fw);
+  else if(IS_PKE_CMD(cmd, PKEMSCNT))
+    pke_code_pkemscnt(me, fw);
+  else if(me->pke_number == 1 && IS_PKE_CMD(cmd, PKEMSCALF))
+    pke_code_pkemscalf(me, fw);
+  else if(IS_PKE_CMD(cmd, STMASK))
+    pke_code_stmask(me, fw);
+  else if(IS_PKE_CMD(cmd, STROW))
+    pke_code_strow(me, fw);
+  else if(IS_PKE_CMD(cmd, STCOL))
+    pke_code_stcol(me, fw);
+  else if(IS_PKE_CMD(cmd, MPG))
+    pke_code_mpg(me, fw);
+  else if(IS_PKE_CMD(cmd, DIRECT))
+    pke_code_direct(me, fw);
+  else if(IS_PKE_CMD(cmd, DIRECTHL))
+    pke_code_directhl(me, fw);
+  else if(IS_PKE_CMD(cmd, UNPACK))
+    pke_code_unpack(me, fw);
+  /* ... other commands ... */
+  else
+    pke_code_error(me, fw);
+}
+
+
+
+/* Advance the PC by the given number of data words, also stepping over
+   any DMA tag words (words 0 and 1 of a tagged quadword) that the PC
+   lands on along the way.  A `num_words' of zero moves the PC only if
+   it currently rests on a DMA tag word; pke_issue() relies on this to
+   get past a tag before fetching an instruction.  Clear the STAT/FQC
+   field once the PC has reached the end of the queued FIFO elements.
+   Assumes the FIFO holds the words being skipped over. */
+
+void
+pke_pc_advance(struct pke_device* me, int num_words)
+{
+  int num = num_words;
+  ASSERT(num_words >= 0);
+
+  /* zero-word request: step only if the PC sits on a DMA tag word;
+     the loop below then keeps going until it is past the tag */
+  if(num == 0 &&
+     me->fifo_pc < me->fifo_num_elements &&
+     me->fifo[me->fifo_pc].dma_tag_present &&
+     me->qw_pc < 2)
+    num = 1;
+
+  while(num > 0)
     {
-      /* read VU status word */
-      unsigned_4 vu_stat;
-      sim_read(NULL,
-              (SIM_ADDR) (me->pke_number == 0 ? VPE0_STAT : VPE1_STAT),
-              (void*) & vu_stat,
-              sizeof(unsigned_4));
-      /* XXX: check RC */
+      struct fifo_quadword* fq;
+
+      /* one word skipped */
+      num --;
 
-      /* check if VGW bit is clear, i.e., PATH1 is idle */
-      /* simulator design implies PATH2 is always "idle" */
-      if(BIT_MASK_GET(vu_stat, VU_REG_STAT_VBS_B, VU_REG_STAT_VBS_E) == 0 &&
-        BIT_MASK_GET(vu_stat, VU_REG_STAT_VGW_B, VU_REG_STAT_VGW_E) == 0 &&
-        1 /* PATH2 always idle */)
+      /* point to next word */
+      me->qw_pc ++;
+      if(me->qw_pc == 4)
        {
-         /* VU idle */
-         /* PATH1 idle */
-         /* PATH2 idle */
-         /* advance PC */
-         pke_pc_advance(me, 1);
+         me->qw_pc = 0;
+         me->fifo_pc ++;
        }
-      else
+
+      /* skip over DMA tag words if present in word 0 or 1; do not
+         look at a FIFO element that has not been queued yet */
+      fq = & me->fifo[me->fifo_pc];
+      if(me->fifo_pc < me->fifo_num_elements &&
+         fq->dma_tag_present && (me->qw_pc < 2))
        {
-         /* GPUIF busy */
-         /* retry this instruction next clock */
+         /* skip by going around loop an extra time */
+         num ++;
        }
     }
-  else if(me->pke_number == 1 && IS_PKE_CMD(cmd, FLUSHA))
+
+  /* clear FQC if FIFO is now empty */ 
+  if(me->fifo_num_elements == me->fifo_pc)
     {
-      /* read VU status word */
-      unsigned_4 vu_stat;
-      sim_read(NULL,
-              (SIM_ADDR) (me->pke_number == 0 ? VPE0_STAT : VPE1_STAT),
-              (void*) & vu_stat,
-              sizeof(unsigned_4));
-      /* XXX: check RC */
-
-      /* check if VGW bit is clear, i.e., PATH1 is idle */
-      /* simulator design implies PATH2 is always "idle" */
-      /* XXX: simulator design implies PATH3 is always "idle" */
-      if(BIT_MASK_GET(vu_stat, VU_REG_STAT_VBS_B, VU_REG_STAT_VBS_E) == 0 &&
-        BIT_MASK_GET(vu_stat, VU_REG_STAT_VGW_B, VU_REG_STAT_VGW_E) == 0 &&
-        1 /* PATH2 always idle */ &&
-         1 /* PATH3 always idle */)
-       {
-         /* VU idle */
-         /* PATH1 idle */
-         /* PATH2 idle */
-         /* PATH3 idle */
-         /* advance PC */
-         pke_pc_advance(me, 1);
-       }
-      else
-       {
-         /* GPUIF busy */
-         /* retry this instruction next clock */
-       }
+      PKE_REG_MASK_SET(me, STAT, FQC, 0);
     }
-  else if(IS_PKE_CMD(cmd, PKEMSCAL))
-    {
-      /* read VU status word */
-      unsigned_4 vu_stat;
-      sim_read(NULL,
-              (SIM_ADDR) (me->pke_number == 0 ? VPE0_STAT : VPE1_STAT),
-              (void*) & vu_stat,
-              sizeof(unsigned_4));
-      /* XXX: check RC */
-
-      /* check if VBS bit is clear, i.e., VU is idle */
-      if(BIT_MASK_GET(vu_stat, VU_REG_STAT_VBS_B, VU_REG_STAT_VBS_E) == 0)
-       {
-         /* VU idle */
-         unsigned_4 vu_pc;
+}
 
-         /* perform PKE1-unique processing for microprogram calls */
-         if(me->pke_number == 1)
-           {
-             /* flip DBF */
-             PKE_REG_MASK_SET(me, DBF, DF,
-                              PKE_REG_MASK_GET(me, DBF, DF) ? 0 : 1);
-             PKE_REG_MASK_SET(me, STAT, DBF, PKE_REG_MASK_GET(me, DBF, DF));
-             /* compute new TOPS */
-             PKE_REG_MASK_SET(me, TOPS, TOPS,
-                              (PKE_REG_MASK_GET(me, BASE, BASE) +
-                               (PKE_REG_MASK_GET(me, DBF, DF) *
-                                PKE_REG_MASK_GET(me, OFST, OFFSET))));
-             /* compute new ITOP and TOP */
-             PKE_REG_MASK_SET(me, ITOP, ITOP,
-                              PKE_REG_MASK_GET(me, ITOPS, ITOPS));
-             PKE_REG_MASK_SET(me, TOP, TOP,
-                              PKE_REG_MASK_GET(me, TOPS, TOPS));
-           }
 
-         /* compute new PC */
-         vu_pc = BIT_MASK_GET(imm, 0, 15); /* XXX: all bits significant? */
-         /* write new PC; callback function gets VU running */
-         sim_write(NULL,
-                   (SIM_ADDR) (me->pke_number == 0 ? VU0_PC_START : VU1_PC_START),
-                   (void*) & vu_pc,
-                   sizeof(unsigned_4));
-         /* advance PC */
-         pke_pc_advance(me, 1);
-       }
-      else
-       {
-         /* VU busy */
-         next_pps_state = PKE_REG_STAT_PPS_WAIT;
-         /* retry this instruction next clock */
-       }
-    }
-  else if(IS_PKE_CMD(cmd, PKEMSCNT))
-    {
-      /* read VU status word */
-      unsigned_4 vu_stat;
-      sim_read(NULL,
-              (SIM_ADDR) (me->pke_number == 0 ? VPE0_STAT : VPE1_STAT),
-              (void*) & vu_stat,
-              sizeof(unsigned_4));
-      /* XXX: check RC */
 
-      /* check if VBS bit is clear, i.e., VU is idle */
-      if(BIT_MASK_GET(vu_stat, VU_REG_STAT_VBS_B, VU_REG_STAT_VBS_E) == 0)
-       {
-         /* VU idle */
-         unsigned_4 vu_pc;
+/* Return pointer to FIFO quadword containing given operand# in FIFO.
+   `operand_num' starts at 1 and counts words past the current PC; the
+   PC itself is not moved.  If `operand' is non-NULL and the operand
+   exists, store a pointer to the operand word through it; otherwise
+   leave `*operand' untouched.  If FIFO is not full enough, return 0.
+   Signal an ER0 indication upon skipping a DMA tag.  */
 
-         /* flip DBF etc. for PKE1 */
-         if(me->pke_number == 1)
-           {
-             PKE_REG_MASK_SET(me, DBF, DF,
-                              PKE_REG_MASK_GET(me, DBF, DF) ? 0 : 1);
-             PKE_REG_MASK_SET(me, STAT, DBF, PKE_REG_MASK_GET(me, DBF, DF));
-             PKE_REG_MASK_SET(me, TOPS, TOPS,
-                              (PKE_REG_MASK_GET(me, BASE, BASE) +
-                               (PKE_REG_MASK_GET(me, DBF, DF) *
-                                PKE_REG_MASK_GET(me, OFST, OFFSET))));
-             PKE_REG_MASK_SET(me, ITOP, ITOP,
-                              PKE_REG_MASK_GET(me, ITOPS, ITOPS));
-             PKE_REG_MASK_SET(me, TOP, TOP,
-                              PKE_REG_MASK_GET(me, TOPS, TOPS));
-           }
+struct fifo_quadword*
+pke_pc_fifo(struct pke_device* me, int operand_num, unsigned_4** operand)
+{
+  int num = operand_num;
+  int new_qw_pc, new_fifo_pc;
+  struct fifo_quadword* operand_fifo = NULL; /* stays NULL if the loop never runs */
 
-         /* read old PC */
-         sim_read(NULL,
-                  (SIM_ADDR) (me->pke_number == 0 ? VU0_PC_START : VU1_PC_START),
-                  (void*) & vu_pc,
-                  sizeof(unsigned_4));
-         /* rewrite its PC; callback function gets VU running */
-         sim_write(NULL,
-                   (SIM_ADDR) (me->pke_number == 0 ? VU0_PC_START : VU1_PC_START),
-                   (void*) & vu_pc,
-                   sizeof(unsigned_4));
-         /* advance PC */
-         pke_pc_advance(me, 1);
-       }
-      else
-       {
-         /* VU busy */
-         next_pps_state = PKE_REG_STAT_PPS_WAIT;
-         /* retry this instruction next clock */
-       }
-    }
-  else if(me->pke_number == 1 && IS_PKE_CMD(cmd, PKEMSCALF))
-    {
-      /* read VU status word */
-      unsigned_4 vu_stat;
-      sim_read(NULL,
-              (SIM_ADDR) (me->pke_number == 0 ? VPE0_STAT : VPE1_STAT),
-              (void*) & vu_stat,
-              sizeof(unsigned_4));
-      /* XXX: check RC */
+  ASSERT(num > 0);
 
-      /* check if VGW bit is clear, i.e., PATH1 is idle */
-      /* simulator design implies PATH2 is always "idle" */
-      if(BIT_MASK_GET(vu_stat, VU_REG_STAT_VBS_B, VU_REG_STAT_VBS_E) == 0 &&
-        BIT_MASK_GET(vu_stat, VU_REG_STAT_VGW_B, VU_REG_STAT_VGW_E) == 0 &&
-        1 /* PATH2 always idle */)
-       {
-         /* VU idle */
-         /* PATH1 idle */
-         /* PATH2 idle */
-         unsigned_4 vu_pc;
+  /* snapshot current pointers; the real PC is left alone */
+  new_fifo_pc = me->fifo_pc;
+  new_qw_pc = me->qw_pc;
 
-         /* flip DBF etc. for PKE1 */
-         if(me->pke_number == 1)
-           {
-             PKE_REG_MASK_SET(me, DBF, DF,
-                              PKE_REG_MASK_GET(me, DBF, DF) ? 0 : 1);
-             PKE_REG_MASK_SET(me, STAT, DBF, PKE_REG_MASK_GET(me, DBF, DF));
-             PKE_REG_MASK_SET(me, TOPS, TOPS,
-                              (PKE_REG_MASK_GET(me, BASE, BASE) +
-                               (PKE_REG_MASK_GET(me, DBF, DF) *
-                                PKE_REG_MASK_GET(me, OFST, OFFSET))));
-             PKE_REG_MASK_SET(me, ITOP, ITOP,
-                              PKE_REG_MASK_GET(me, ITOPS, ITOPS));
-             PKE_REG_MASK_SET(me, TOP, TOP,
-                              PKE_REG_MASK_GET(me, TOPS, TOPS));
-           }
+  while(num > 0)
+    {
+      /* one word skipped */
+      num --;
 
-         /* compute new PC */
-         vu_pc = BIT_MASK_GET(imm, 0, 15); /* XXX: all bits significant? */
-         /* write new PC; callback function gets VU running */
-         sim_write(NULL,
-                   (SIM_ADDR) (me->pke_number == 0 ? VU0_PC_START : VU1_PC_START),
-                   (void*) & vu_pc,
-                   sizeof(unsigned_4));
-         /* advance PC */
-         pke_pc_advance(me, 1);
-       }
-      else
+      /* point to next word */
+      new_qw_pc ++;
+      if(new_qw_pc == 4)
        {
-         /* VU busy */
-         next_pps_state = PKE_REG_STAT_PPS_WAIT;
-         /* retry this instruction next clock */
+         new_qw_pc = 0;
+         new_fifo_pc ++;
        }
-    }
-  else if(IS_PKE_CMD(cmd, STMASK))
-    {
-      /* check that FIFO has one more word for STMASK operand */
-      unsigned_4* mask;
 
-      mask = pke_pc_operand(me, 1);
-      if(mask != NULL)
+      /* check for FIFO underflow */
+      if(me->fifo_num_elements == new_fifo_pc)
        {
-         /* "transferring" operand */
-         PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_XFER);
-         /* fill the register */
-         PKE_REG_MASK_SET(me, MASK, MASK, *mask);
-         /* advance PC */
-         pke_pc_advance(me, 2);
+         operand_fifo = NULL;
+         break;
        }
-      else
+
+      /* skip over DMA tag words if present in word 0 or 1 */
+      operand_fifo = & me->fifo[new_fifo_pc];
+      if(operand_fifo->dma_tag_present && (new_qw_pc < 2))
        {
-         /* need to wait for another word */
-         next_pps_state = PKE_REG_STAT_PPS_WAIT;
-         /* retry this instruction next clock */
+         /* mismatch error! */
+         PKE_REG_MASK_SET(me, STAT, ER0, 1);
+         /* skip by going around loop an extra time */
+         num ++;
        }
     }
-  else if(IS_PKE_CMD(cmd, STROW))
-    {
-      /* check that FIFO has four more words for STROW operand */
-      unsigned_4* last_op;
 
-      last_op = pke_pc_operand(me, 4);
-      if(last_op != NULL)
-       {
-         /* "transferring" operand */
-         PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_XFER);
+  /* return pointer to operand word itself, if the caller asked for it */
+  if(operand_fifo != NULL && operand != NULL)
+    *operand = & operand_fifo->data[new_qw_pc];
 
-         /* copy ROW registers: must all exist if 4th operand exists */
-         me->regs[PKE_REG_R0][0] = * pke_pc_operand(me, 1);
-         me->regs[PKE_REG_R1][0] = * pke_pc_operand(me, 2);
-         me->regs[PKE_REG_R2][0] = * pke_pc_operand(me, 3);
-         me->regs[PKE_REG_R3][0] = * pke_pc_operand(me, 4);
+  return operand_fifo;
+}
 
-         /* advance PC */
-         pke_pc_advance(me, 5);
-       }
-      else
-       {
-         /* need to wait for another word */
-         next_pps_state = PKE_REG_STAT_PPS_WAIT;
-         /* retry this instruction next clock */
-       }
-    }
-  else if(IS_PKE_CMD(cmd, STCOL))
-    {
-      /* check that FIFO has four more words for STCOL operand */
-      unsigned_4* last_op;
 
-      last_op = pke_pc_operand(me, 4);
-      if(last_op != NULL)
-       {
-         /* "transferring" operand */
-         PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_XFER);
+/* Return pointer to given operand# in FIFO.  `operand_num' starts at 1.
+   If FIFO is not full enough, return 0.  Skip over DMA tags, but mark
+   them as an error (ER0).  This is a thin wrapper around pke_pc_fifo()
+   that keeps only the operand word pointer and drops the quadword
+   pointer. */
 
-         /* copy COL registers: must all exist if 4th operand exists */
-         me->regs[PKE_REG_C0][0] = * pke_pc_operand(me, 1);
-         me->regs[PKE_REG_C1][0] = * pke_pc_operand(me, 2);
-         me->regs[PKE_REG_C2][0] = * pke_pc_operand(me, 3);
-         me->regs[PKE_REG_C3][0] = * pke_pc_operand(me, 4);
+unsigned_4*
+pke_pc_operand(struct pke_device* me, int operand_num)
+{
+  unsigned_4* operand = NULL;
+  struct fifo_quadword* fifo_operand;
 
-         /* advance PC */
-         pke_pc_advance(me, 5);
-       }
-      else
-       {
-         /* need to wait for another word */
-         next_pps_state = PKE_REG_STAT_PPS_WAIT;
-         /* retry this instruction next clock */
-       }
-    }
-  else if(IS_PKE_CMD(cmd, MPG))
-    {
-      unsigned_4* last_mpg_word;
+  fifo_operand = pke_pc_fifo(me, operand_num, & operand);
 
-      /* map zero to max+1 */
-      if(num==0) num=0x100;
+  if(fifo_operand == NULL)
+    ASSERT(operand == NULL); /* pke_pc_fifo() ought to leave it untouched on underflow */
 
-      /* check that FIFO has a few more words for MPG operand */
-      last_mpg_word = pke_pc_operand(me, num*2); /* num: number of 64-bit words */
-      if(last_mpg_word != NULL)
-       {
-         /* perform implied FLUSHE */
-         /* read VU status word */
-         unsigned_4 vu_stat;
-         sim_read(NULL,
-                  (SIM_ADDR) (me->pke_number == 0 ? VPE0_STAT : VPE1_STAT),
-                  (void*) & vu_stat,
-                  sizeof(unsigned_4));
-         /* XXX: check RC */
-         
-         /* check if VBS bit is clear, i.e., VU is idle */
-         if(BIT_MASK_GET(vu_stat, VU_REG_STAT_VBS_B, VU_REG_STAT_VBS_E) == 0)
-           {
-             /* VU idle */
-             int i;
+  return operand;
+}
 
-             /* "transferring" operand */
-             PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_XFER);
 
-             /* transfer VU instructions, one word per iteration */
-             for(i=0; i<num*2; i++)
-               {
-                 address_word vu_addr_base, vu_addr;
-                 address_word vutrack_addr_base, vutrack_addr;
-                 struct fifo_quadword* fq = pke_pc_fifo(me, num);
-                 unsigned_4* operand = pke_pc_operand(me, num);
-
-                 /* imm: in 64-bit units for MPG instruction */
-
-                 /* XXX: set NUM */
-
-                 /* VU*_MEM0 : instruction memory */
-                 vu_addr_base = (me->pke_number == 0) ?
-                   VU0_MEM0_WINDOW_START : VU0_MEM0_WINDOW_START;
-                 vu_addr = vu_addr_base + (imm*2) + i;
-
-                 /* VU*_MEM0_TRACK : source-addr tracking table */
-                 vutrack_addr_base = (me->pke_number == 0) ?
-                   VU0_MEM0_SRCADDR_START : VU1_MEM0_SRCADDR_START;
-                 vutrack_addr = vu_addr_base + (imm*2) + i;
-
-                 /* write data into VU memory */
-                 pke_track_write(me, operand, sizeof(unsigned_4),
-                                 vu_addr, fq->source_address);
-
-                 /* write srcaddr into VU srcaddr tracking table */
-                 sim_write(NULL,
-                           (SIM_ADDR) vutrack_addr,
-                           (void*) & fq->source_address,
-                           sizeof(unsigned_4));
-                 /* XXX: check RC */
-               } /* VU xfer loop */
-             
-             /* advance PC */
-             pke_pc_advance(me, 1 + num*2);
-           }
-         else
-           {
-             /* VU busy */
-             next_pps_state = PKE_REG_STAT_PPS_WAIT;
-             /* retry this instruction next clock */
-           }
-       } /* if FIFO full enough */
-      else
-       {
-         /* need to wait for another word */
-         next_pps_state = PKE_REG_STAT_PPS_WAIT;
-         /* retry this instruction next clock */
-       }
-    }
-  else if(IS_PKE_CMD(cmd, DIRECT) || IS_PKE_CMD(cmd, DIRECTHL)) /* treat identically */
-    {
-      /* check that FIFO has a few more words for DIRECT operand */
-      unsigned_4* last_direct_word;
 
-      /* map zero to max+1 */
-      if(imm==0) imm=0x10000;
 
-      last_direct_word = pke_pc_operand(me, imm*4); /* num: number of 128-bit words */
-      if(last_direct_word != NULL)
-       {
-         /* VU idle */
-         int i;
-         quadword fifo_data;
 
-         /* "transferring" operand */
-         PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_XFER);
 
-         /* transfer GPUIF quadwords, one word per iteration */
-         for(i=0; i<imm*4; i++)
-           {
-             struct fifo_quadword* fq = pke_pc_fifo(me, num);
-             unsigned_4* operand = pke_pc_operand(me, num);
-             
-             /* collect word into quadword */
-             fifo_data[i%4] = *operand;
+/* Write `len' bytes from `src' into simulator memory at `dest'.  Around
+   that write, store `sourceaddr' into this PKE's sourceaddr tracking
+   word (PKE0_SRCADDR or PKE1_SRCADDR) and then reset it to zero.
+   Return the result of the data sim_write(); the results of the two
+   tracking-word writes are not checked. */
+int
+pke_track_write(struct pke_device* me, const void* src, int len, 
+               address_word dest, unsigned_4 sourceaddr)
+{
+  int rc;
+  unsigned_4 no_sourceaddr = 0;
 
-             /* write to GPUIF FIFO only with full word */
-             if(i%4 == 3)
-               {
-                 address_word gpuif_fifo = GPUIF_PATH2_FIFO_ADDR+(i/4);
-                 pke_track_write(me, fifo_data, sizeof(quadword),
-                                 (SIM_ADDR) gpuif_fifo, fq->source_address);
-                 /* XXX: check RC */
-               } /* write collected quadword */
+  /* write srcaddr into PKE srcaddr tracking; the cast must cover the
+     whole conditional, not just its test */
+  sim_write(NULL,
+           (SIM_ADDR) (me->pke_number == 0 ? PKE0_SRCADDR : PKE1_SRCADDR),
+           (void*) & sourceaddr,
+           sizeof(unsigned_4));
+  
+  /* write bytes into simulator */
+  rc = sim_write(NULL,
+                (SIM_ADDR) dest,
+                (void*) src,
+                len);
+  
+  /* clear srcaddr from PKE srcaddr tracking */
+  sim_write(NULL,
+           (SIM_ADDR) (me->pke_number == 0 ? PKE0_SRCADDR : PKE1_SRCADDR),
+           (void*) & no_sourceaddr,
+           sizeof(unsigned_4));
 
-           } /* GPUIF xfer loop */
-             
-         /* advance PC */
-         pke_pc_advance(me, 1 + imm*4);
-       } /* if FIFO full enough */
-      else
-       {
-         /* need to wait for another word */
-         next_pps_state = PKE_REG_STAT_PPS_WAIT;
-         /* retry this instruction next clock */
-       }
+  return rc;
+}
+
+
+/* Check for stall conditions on the indicated device (path* only on
+   PKE1) without changing any status.  The VPE0/VPE1 STAT word is read
+   with sim_read(): chk_vu tests its VBS field and chk_path1 its VGW
+   field.  Any other target reaches ASSERT(0).  Return 0 iff no stall. */ 
+int
+pke_check_stall(struct pke_device* me, enum pke_check_target what)
+{
+  int any_stall = 0;
+
+  /* read VU status word - commonly used; sim_read() result is not checked */
+  unsigned_4 vu_stat;
+  sim_read(NULL,
+          (SIM_ADDR) (me->pke_number == 0 ? VPE0_STAT : VPE1_STAT),
+          (void*) & vu_stat,
+          sizeof(unsigned_4));
+
+  /* perform the check selected by `what' */
+  if(what == chk_vu)
+    {
+      /* check if VBS bit is set, i.e., VU is busy */
+      if(BIT_MASK_GET(vu_stat, VU_REG_STAT_VBS_B, VU_REG_STAT_VBS_E) == 1)
+       any_stall = 1;
     }
-  else if(IS_PKE_CMD(cmd, UNPACK)) /* warning: monster complexity */
+  else if(what == chk_path1)
     {
-      short vn = BIT_MASK_GET(cmd, 2, 3);
-      short vl = BIT_MASK_GET(cmd, 0, 1);
-      short vnvl = BIT_MASK_GET(cmd, 0, 3);
-      int m = BIT_MASK_GET(cmd, 4, 4);
-      short cl = PKE_REG_MASK_GET(me, CYCLE, CL);
-      short wl = PKE_REG_MASK_GET(me, CYCLE, WL);
-      int n, num_operands;
-      unsigned_4* last_operand_word;
-
-      /* map zero to max+1 */
-      if(num==0) num=0x100;
-
-      /* compute PKEcode length, as given in CPU2 spec, v2.1 pg. 11 */
-      if(wl <= cl)
-       n = num;
-      else
-       n = cl * (num/wl) + PKE_LIMIT(num % wl, cl);
-      num_operands = (((sizeof(unsigned_4) >> vl) * (vn+1) * n)/sizeof(unsigned_4));
+      /* only valid on PKE1; this function does not enforce that */
+      /* check if VGW bit is set, i.e., PATH1 is busy */
+      if(BIT_MASK_GET(vu_stat, VU_REG_STAT_VGW_B, VU_REG_STAT_VGW_E) == 1)
+       any_stall = 1;
+    }
+  else
+    {
+      ASSERT(0); /* XXX: chk_path2 / chk_path3 not done yet */
+    }
 
-      /* confirm that FIFO has enough words in it */
-      last_operand_word = pke_pc_operand(me, num_operands);
-      if(last_operand_word != NULL)
-       {
-         address_word vu_addr_base;
-         int operand_num, vector_num;
+  /* any stall reasons? */
+  return any_stall;
+}
 
-         /* "transferring" operand */
-         PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_XFER);
 
-         /* XXX: don't check whether VU is idle?? */
+/* Flip the DF bit of the DBF register, mirror it into STAT.DBF, then recompute TOPS, ITOP & TOP */
+void
+pke_flip_dbf(struct pke_device* me)
+{
+  /* flip DBF: DF becomes 0 if it was non-zero, 1 otherwise; STAT.DBF then copies the new DF */
+  PKE_REG_MASK_SET(me, DBF, DF,
+                  PKE_REG_MASK_GET(me, DBF, DF) ? 0 : 1);
+  PKE_REG_MASK_SET(me, STAT, DBF, PKE_REG_MASK_GET(me, DBF, DF));
+  /* compute new TOPS = BASE + DF * OFFSET, i.e. BASE when DF is 0 and BASE+OFFSET when DF is 1 */
+  PKE_REG_MASK_SET(me, TOPS, TOPS,
+                  (PKE_REG_MASK_GET(me, BASE, BASE) +
+                   (PKE_REG_MASK_GET(me, DBF, DF) *
+                    PKE_REG_MASK_GET(me, OFST, OFFSET))));
+  /* copy ITOPS into ITOP, and the TOPS value just computed above into TOP.
     NOTE(review): confirm that TOP is meant to receive the new TOPS rather than the previous one */
+  PKE_REG_MASK_SET(me, ITOP, ITOP,
+                  PKE_REG_MASK_GET(me, ITOPS, ITOPS));
+  PKE_REG_MASK_SET(me, TOP, TOP,
+                  PKE_REG_MASK_GET(me, TOPS, TOPS));
+}
 
-         if(me->pke_number == 0)
-           vu_addr_base = VU0_MEM1_WINDOW_START + BIT_MASK_GET(imm, 0, 9);
-         else
-           {
-             vu_addr_base = VU1_MEM1_WINDOW_START + BIT_MASK_GET(imm, 0, 9);
-             if(BIT_MASK_GET(imm, 15, 15)) /* fetch R flag from imm word */
-               vu_addr_base += PKE_REG_MASK_GET(me, TOPS, TOPS);
-           }
 
-         /* XXX: vu_addr overflow check */
 
-         /* transfer given number of vectors */
-         operand_num = 1; /* word index into instruction stream: 1..num_operands */
-         vector_num = 0;  /* vector number being processed: 0..num-1 */
-         while(operand_num <= num_operands)
-           {
-             quadword vu_old_data;
-             quadword vu_new_data;
-             quadword unpacked_data;
-             address_word vu_addr;
-             struct fifo_quadword* fq;
-             int i;
+/* PKEcode handler functions -- responsible for checking and
+   confirming old stall conditions, executing pkecode, updating PC and
+   status registers -- may assume being run on correct PKE unit */
+   
+void 
+pke_code_nop(struct pke_device* me, unsigned_4 pkecode)
+{
+  /* nothing to execute (pkecode is unused): advance the PC one word and set STAT.PPS to idle */
+  pke_pc_advance(me, 1);
+  PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+}
 
-             /* XXX: set NUM */
 
-             /* compute VU destination address, as bytes in R5900 memory */
-             if(cl >= wl)
-               {
-                 /* map zero to max+1 */
-                 if(wl == 0) wl = 0x0100;
-                 vu_addr = vu_addr_base + 16*(cl*(vector_num/wl) + (vector_num%wl));
-               }
-             else
-               vu_addr = vu_addr_base + 16*vector_num;
+void
+pke_code_stcycl(struct pke_device* me, unsigned_4 pkecode)
+{
+  int imm = BIT_MASK_GET(pkecode, PKE_OPCODE_IMM_B, PKE_OPCODE_IMM_E);
+  /* copy the whole immediate field of the PKEcode into word 0 of the CYCLE register */
+  me->regs[PKE_REG_CYCLE][0] = imm;
+  /* done: advance the PC past this one-word PKEcode and set STAT.PPS to idle */
+  pke_pc_advance(me, 1);
+  PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+}
 
-             /* read old VU data word at address */
-             sim_read(NULL, (SIM_ADDR) vu_addr, (void*) & vu_old_data, sizeof(vu_old_data));
 
-             /* Let sourceaddr track the first operand */
-             fq = pke_pc_fifo(me, operand_num);
+void
+pke_code_offset(struct pke_device* me, unsigned_4 pkecode)
+{
+  int imm = BIT_MASK_GET(pkecode, PKE_OPCODE_IMM_B, PKE_OPCODE_IMM_E);
+  /* copy the low 10 bits (0..9) of the immediate to the OFFSET field of OFST */
+  PKE_REG_MASK_SET(me, OFST, OFFSET, BIT_MASK_GET(imm, 0, 9));
+  /* clear DF bit in the DBF register */
+  PKE_REG_MASK_SET(me, DBF, DF, 0);
+  /* clear the copy of that bit kept in STAT.DBF */
+  PKE_REG_MASK_SET(me, STAT, DBF, 0);
+  /* set TOPS = BASE */
+  PKE_REG_MASK_SET(me, TOPS, TOPS, PKE_REG_MASK_GET(me, BASE, BASE));
+  /* done: advance the PC past this one-word PKEcode and set STAT.PPS to idle */
+  pke_pc_advance(me, 1);
+  PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+}
 
-             /* For cyclic unpack, next operand quadword may come from instruction stream
-                or be zero. */
-             if((cl < wl) && ((vector_num % wl) >= cl)) /* wl != 0, set above */
-               {
-                 /* clear operand - used only in a "indeterminate" state */
-                 for(i = 0; i < 4; i++)
-                   unpacked_data[i] = 0;
-               }
-             else
-               {
-                 /* compute unpacked words from instruction stream */
-                 switch(vnvl)
-                   {
-                   case PKE_UNPACK_S_32:
-                   case PKE_UNPACK_V2_32:
-                   case PKE_UNPACK_V3_32:
-                   case PKE_UNPACK_V4_32:
-                     /* copy (vn+1) 32-bit values */
-                     for(i = 0; i < vn+1; i++)
-                       {
-                         unsigned_4* operand = pke_pc_operand(me, operand_num);
-                         unpacked_data[i] = *operand;
-                         operand_num ++;
-                       }
-                     break;
-                     
-                   case PKE_UNPACK_S_16:
-                   case PKE_UNPACK_V2_16:
-                   case PKE_UNPACK_V3_16:
-                   case PKE_UNPACK_V4_16:
-                     /* copy (vn+1) 16-bit values, packed two-per-word */
-                     for(i=0; i<vn+1; i+=2)
-                       {
-                         unsigned_4* operand = pke_pc_operand(me, operand_num);
-                         unpacked_data[i] = BIT_MASK_GET_SX(*operand, 0, 15, 31);
-                         unpacked_data[i+1] = BIT_MASK_GET_SX(*operand, 16, 31, 31);
-                         operand_num ++;
-                       }
-                     break;
-                     
-                   case PKE_UNPACK_S_8:
-                   case PKE_UNPACK_V2_8:
-                   case PKE_UNPACK_V3_8:
-                   case PKE_UNPACK_V4_8:
-                     /* copy (vn+1) 8-bit values, packed four-per-word */
-                     for(i=0; i<vn+1; i+=4)
-                       {
-                         unsigned_4* operand = pke_pc_operand(me, operand_num);
-                         unpacked_data[i] = BIT_MASK_GET_SX(*operand, 0, 7, 31);
-                         unpacked_data[i+1] = BIT_MASK_GET_SX(*operand, 8, 15, 31);
-                         unpacked_data[i+2] = BIT_MASK_GET_SX(*operand, 16, 23, 31);
-                         unpacked_data[i+3] = BIT_MASK_GET_SX(*operand, 24, 31, 31);
-                         operand_num ++;
-                       }
-                     break;
-                     
-                   case PKE_UNPACK_V4_5:
-                     /* copy four 1/5/5/5-bit values, packed into a sixteen-bit */
-                     for(i=0; i<vn+1; i+=4)
-                       {
-                         unsigned_4* operand = pke_pc_operand(me, operand_num);
-                         unpacked_data[i] = BIT_MASK_GET_SX(*operand, 0, 4, 31);
-                         unpacked_data[i+1] = BIT_MASK_GET_SX(*operand, 5, 9, 31);
-                         unpacked_data[i+2] = BIT_MASK_GET_SX(*operand, 10, 14, 31);
-                         unpacked_data[i+3] = BIT_MASK_GET_SX(*operand, 15, 15, 31);
-                         /* ignore other 16 bits in operand */
-                         operand_num ++;
-                       }
-                     break;
-                     
-                   default: /* bad UNPACK code */
-                     {
-                       /* XXX: how to handle? */
-                       /* set ER1 flag in STAT register */
-                       PKE_REG_MASK_SET(me, STAT, ER1, 1);
-                     }           
-                   }
-               }
-             
-             /* compute replacement word - function of vn, vl, mask */
-             if(m) /* use mask register? */
-               {
-                 /* compute index into mask register for this word */
-                 int mask_index = PKE_LIMIT(vector_num % wl, 3);  /* wl != 0, set above */
 
-                 for(i=0; i<3; i++) /* loop over columns */
-                   {
-                     int mask_op = PKE_MASKREG_GET(me, mask_index, i);
-                     unsigned_4* masked_value = NULL;
-                     unsigned_4 zero = 0;
-
-                     switch(mask_op)
-                       {
-                       case PKE_MASKREG_INPUT: 
-                         /* for vn == 0, all columns are copied from column 0 */
-                         if(vn == 0)
-                           masked_value = & unpacked_data[0];
-                         else if(i > vn)
-                           masked_value = & zero; /* XXX: what to put here? */
-                         else
-                           masked_value = & unpacked_data[i];
-                         break;
-
-                       case PKE_MASKREG_ROW: /* exploit R0..R3 contiguity */
-                         masked_value = & me->regs[PKE_REG_R0 + i][0];
-                         break;
-
-                       case PKE_MASKREG_COLUMN: /* exploit C0..C3 contiguity */
-                         masked_value = & me->regs[PKE_REG_C0 + PKE_LIMIT(vector_num,3)][0];
-                         break;
-
-                       case PKE_MASKREG_NOTHING:
-                         /* "write inhibit" by re-copying old data */
-                         masked_value = & vu_old_data[i];
-                         break;
-
-                       default:
-                         ASSERT(0);
-                         /* no other cases possible */
-                       }
-
-                     /* copy masked value for column */
-                     memcpy(& vu_new_data[i], masked_value, sizeof(unsigned_4));
-                   } /* loop over columns */
-               }
-             else
-               {
-                 /* no mask - just copy over entire unpacked quadword */
-                 memcpy(vu_new_data, unpacked_data, sizeof(unpacked_data));
-               }
+void
+pke_code_base(struct pke_device* me, unsigned_4 pkecode)
+{
+  int imm = BIT_MASK_GET(pkecode, PKE_OPCODE_IMM_B, PKE_OPCODE_IMM_E);
+  /* copy the low 10 bits (0..9) of the immediate to the BASE field of BASE */
+  PKE_REG_MASK_SET(me, BASE, BASE, BIT_MASK_GET(imm, 0, 9));
+  /* clear DF bit - NOTE(review): this and the next two updates duplicate pke_code_offset; confirm */
+  PKE_REG_MASK_SET(me, DBF, DF, 0);
+  /* clear the copy of that bit kept in STAT.DBF */
+  PKE_REG_MASK_SET(me, STAT, DBF, 0);
+  /* set TOPS = BASE (the value just written above) */
+  PKE_REG_MASK_SET(me, TOPS, TOPS, PKE_REG_MASK_GET(me, BASE, BASE));
+  /* done: advance the PC past this one-word PKEcode and set STAT.PPS to idle */
+  pke_pc_advance(me, 1);
+  PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+}
 
-             /* process STMOD register for accumulation operations */
-             switch(PKE_REG_MASK_GET(me, MODE, MDE))
-               {
-               case PKE_MODE_ADDROW: /* add row registers to output data */
-                 for(i=0; i<4; i++)
-                   /* exploit R0..R3 contiguity */
-                   vu_new_data[i] += me->regs[PKE_REG_R0 + i][0];
-                 break;
 
-               case PKE_MODE_ACCROW: /* add row registers to output data; accumulate */
-                 for(i=0; i<4; i++)
-                   {
-                     /* exploit R0..R3 contiguity */
-                     vu_new_data[i] += me->regs[PKE_REG_R0 + i][0];
-                     me->regs[PKE_REG_R0 + i][0] = vu_new_data[i];
-                   }
-                 break;
+void
+pke_code_itop(struct pke_device* me, unsigned_4 pkecode)
+{
+  int imm = BIT_MASK_GET(pkecode, PKE_OPCODE_IMM_B, PKE_OPCODE_IMM_E);
+  /* copy the low 10 bits (0..9) of the immediate to the ITOPS field; ITOP itself is not touched */
+  PKE_REG_MASK_SET(me, ITOPS, ITOPS, BIT_MASK_GET(imm, 0, 9));
+  /* done: advance the PC past this one-word PKEcode and set STAT.PPS to idle */
+  pke_pc_advance(me, 1);
+  PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+}
 
-               case PKE_MODE_INPUT: /* pass data through */
-               default:
-                 ;
-               }
 
-             /* write replacement word */
-             pke_track_write(me, vu_new_data, sizeof(vu_new_data),
-                             (SIM_ADDR) vu_addr, fq->source_address);
+void
+pke_code_stmod(struct pke_device* me, unsigned_4 pkecode)
+{
+  int imm = BIT_MASK_GET(pkecode, PKE_OPCODE_IMM_B, PKE_OPCODE_IMM_E);
+  /* copy 2 bits (0..1; BIT_MASK_GET bit ranges are inclusive) to the MDE field of MODE */
+  PKE_REG_MASK_SET(me, MODE, MDE, BIT_MASK_GET(imm, 0, 1));
+  /* done: advance the PC past this one-word PKEcode and set STAT.PPS to idle */
+  pke_pc_advance(me, 1);
+  PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+}
 
-             /* next vector please */
-             vector_num ++;
-           } /* vector transfer loop */
-       } /* PKE FIFO full enough */
-      else
-       {
-         /* need to wait for another word */
-         next_pps_state = PKE_REG_STAT_PPS_WAIT;
-         /* retry this instruction next clock */
-       }
+
+void
+pke_code_mskpath3(struct pke_device* me, unsigned_4 pkecode)
+{
+  ASSERT(0);
+  /* XXX: cannot handle this one yet - the handler only asserts; PC and STAT are left untouched */
+}
+
+
+void
+pke_code_pkemark(struct pke_device* me, unsigned_4 pkecode)
+{
+  int imm = BIT_MASK_GET(pkecode, PKE_OPCODE_IMM_B, PKE_OPCODE_IMM_E);
+  /* copy 16 bits (0..15) of the immediate to the MARK register */
+  PKE_REG_MASK_SET(me, MARK, MARK, BIT_MASK_GET(imm, 0, 15));
+  /* set MRK bit in STAT register - CPU2 v2.1 docs incorrect */
+  PKE_REG_MASK_SET(me, STAT, MRK, 1);
+  /* done: advance the PC past this one-word PKEcode and set STAT.PPS to idle */
+  pke_pc_advance(me, 1);
+  PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+}
+
+
+void
+pke_code_flushe(struct pke_device* me, unsigned_4 pkecode)
+{
+  /* compute next PEW bit: hold this PKEcode while pke_check_stall reports the VU busy */
+  if(pke_check_stall(me, chk_vu))
+    {
+      /* VU busy */
+      PKE_REG_MASK_SET(me, STAT, PEW, 1);
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_STALL);
+      /* try again next cycle: the PC is deliberately not advanced */
     }
-  /* ... */
   else
     {
-      /* set ER1 flag in STAT register */
-      PKE_REG_MASK_SET(me, STAT, ER1, 1);
-      /* advance over faulty word */
+      /* VU idle: clear PEW, mark the PKE idle and step past this one-word PKEcode */
+      PKE_REG_MASK_SET(me, STAT, PEW, 0);
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
       pke_pc_advance(me, 1);
     }
-
-  /* PKE is now idle or waiting */
-  PKE_REG_MASK_SET(me, STAT, PPS, next_pps_state);
 }
 
 
+void
+pke_code_flush(struct pke_device* me, unsigned_4 pkecode)
+{
+  int something_busy = 0;
+
+  /* compute next PEW (VU busy) and PGW (PATH1 or PATH2 busy) bits */
+  if(pke_check_stall(me, chk_vu))
+    {
+      something_busy = 1;
+      PKE_REG_MASK_SET(me, STAT, PEW, 1);
+    }
+  else
+    PKE_REG_MASK_SET(me, STAT, PEW, 0);
 
 
+  if(pke_check_stall(me, chk_path1) ||
+     pke_check_stall(me, chk_path2)) /* XXX: chk_path2 currently hits ASSERT(0) in pke_check_stall */
+    {
+      something_busy = 1;
+      PKE_REG_MASK_SET(me, STAT, PGW, 1);
+    }
+  else
+    PKE_REG_MASK_SET(me, STAT, PGW, 0);
 
+  /* go or no go: wait if either check above reported a stall */
+  if(something_busy)
+    {
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_WAIT);
+      /* try again next cycle: the PC is deliberately not advanced */
+    }
+  else
+    {
+      /* all idle: mark the PKE idle and step past this one-word PKEcode */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+      pke_pc_advance(me, 1);
+    }
+}
 
-/* advance the PC by given number of words; update STAT/FQC field */
 
 void
-pke_pc_advance(struct pke_device* me, int num_words)
+pke_code_flusha(struct pke_device* me, unsigned_4 pkecode)
 {
-  ASSERT(num_words > 0);
+  int something_busy = 0;
 
-  me->qw_pc += num_words;
-  /* handle overflow */
-  while(me->qw_pc >= 4)
+  /* compute next PEW (VU busy) and PGW (any of PATH1..PATH3 busy) bits */
+  if(pke_check_stall(me, chk_vu))
     {
-      me->qw_pc -= 4;
-      me->fifo_pc ++;
+      something_busy = 1;
+      PKE_REG_MASK_SET(me, STAT, PEW, 1);
     }
+  else
+    PKE_REG_MASK_SET(me, STAT, PEW, 0);
 
-  /* clear FQC if FIFO is now empty */ 
-  if(me->fifo_num_elements == me->fifo_pc)
+  /* XXX: chk_path2 and chk_path3 currently hit ASSERT(0) in pke_check_stall */
+  if(pke_check_stall(me, chk_path1) ||
+     pke_check_stall(me, chk_path2) ||
+     pke_check_stall(me, chk_path3))
     {
-      PKE_REG_MASK_SET(me, STAT, FQC, 0);
+      something_busy = 1;
+      PKE_REG_MASK_SET(me, STAT, PGW, 1);
     }
+  else
+    PKE_REG_MASK_SET(me, STAT, PGW, 0);
 
+  if(something_busy)
+    {
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_WAIT);
+      /* try again next cycle: the PC is deliberately not advanced */
+    }
+  else
+    {
+      /* all idle: mark the PKE idle and step past this one-word PKEcode */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+      pke_pc_advance(me, 1);
+    }
 }
 
 
+void
+pke_code_pkemscal(struct pke_device* me, unsigned_4 pkecode)
+{
+  /* compute next PEW bit: hold this PKEcode while pke_check_stall reports the VU busy */
+  if(pke_check_stall(me, chk_vu))
+    {
+      /* VU busy */
+      PKE_REG_MASK_SET(me, STAT, PEW, 1);
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_STALL);
+      /* try again next cycle: the PC is deliberately not advanced */
+    }
+  else
+    {
+      unsigned_4 vu_pc;
+      int imm = BIT_MASK_GET(pkecode, PKE_OPCODE_IMM_B, PKE_OPCODE_IMM_E);
+
+      /* VU idle */
+      PKE_REG_MASK_SET(me, STAT, PEW, 0);
+
+      /* flip DBF on PKE1 only; PKE0 skips this step */
+      if(me->pke_number == 1)
+       pke_flip_dbf(me);
+
+      /* compute new PC for VU from the immediate field */
+      vu_pc = BIT_MASK_GET(imm, 0, 15); /* XXX: all bits significant? */
+      /* write new PC to this PKE's VU PC address; callback function gets VU running */
+      sim_write(NULL,
+               (SIM_ADDR) (me->pke_number == 0 ? VU0_PC_START : VU1_PC_START),
+               (void*) & vu_pc,
+               sizeof(unsigned_4));
+
+      /* done: mark the PKE idle and step past this one-word PKEcode */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+      pke_pc_advance(me, 1);
+    }
+}
 
-/* Return pointer to given operand# in FIFO.  `word_num' starts at 1.
-   If FIFO is not full enough, return 0. */
 
-unsigned_4*
-pke_pc_operand(struct pke_device* me, int word_num)
+
+void
+pke_code_pkemscnt(struct pke_device* me, unsigned_4 pkecode)
 {
-  int new_qw_pc = 0;   
-  int new_fifo_pc;
-  unsigned_4* operand;
+  /* compute next PEW bit: hold this PKEcode while pke_check_stall reports the VU busy */
+  if(pke_check_stall(me, chk_vu))
+    {
+      /* VU busy */
+      PKE_REG_MASK_SET(me, STAT, PEW, 1);
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_STALL);
+      /* try again next cycle: the PC is deliberately not advanced */
+    }
+  else
+    {
+      unsigned_4 vu_pc;
 
-  ASSERT(word_num > 0);
+      /* VU idle */
+      PKE_REG_MASK_SET(me, STAT, PEW, 0);
 
-  new_fifo_pc = me->fifo_pc;
-  new_qw_pc += me->qw_pc + word_num;
+      /* flip DBF on PKE1 only; PKE0 skips this step */
+      if(me->pke_number == 1)
+       pke_flip_dbf(me);
 
-  /* handle overflow */
-  while(new_qw_pc >= 4)
+      /* read the VU's current PC; unlike PKEMSCAL, the immediate field is not used here */
+      sim_read(NULL,
+              (SIM_ADDR) (me->pke_number == 0 ? VU0_PC_START : VU1_PC_START),
+              (void*) & vu_pc,
+              sizeof(unsigned_4));
+
+      /* write the same PC value back unchanged; callback function gets VU running */
+      sim_write(NULL,
+               (SIM_ADDR) (me->pke_number == 0 ? VU0_PC_START : VU1_PC_START),
+               (void*) & vu_pc,
+               sizeof(unsigned_4));
+
+      /* done: mark the PKE idle and step past this one-word PKEcode */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+      pke_pc_advance(me, 1);
+    }
+}
+
+
+void
+pke_code_pkemscalf(struct pke_device* me, unsigned_4 pkecode)
+{
+  int something_busy = 0;
+
+  /* compute next PEW (VU busy) and PGW (any of PATH1..PATH3 busy) bits */
+  if(pke_check_stall(me, chk_vu))
     {
-      new_qw_pc -= 4;
-      new_fifo_pc ++;
+      something_busy = 1;
+      PKE_REG_MASK_SET(me, STAT, PEW, 1);
     }
+  else
+    PKE_REG_MASK_SET(me, STAT, PEW, 0);
 
-  /* not enough elements */
-  if(me->fifo_num_elements == me->fifo_pc)
-    operand = NULL;
+  /* XXX: chk_path2 and chk_path3 currently hit ASSERT(0) in pke_check_stall */
+  if(pke_check_stall(me, chk_path1) ||
+     pke_check_stall(me, chk_path2) ||
+     pke_check_stall(me, chk_path3))
+    {
+      something_busy = 1;
+      PKE_REG_MASK_SET(me, STAT, PGW, 1);
+    }
   else
-    operand = & me->fifo[new_fifo_pc].data[new_qw_pc];
+    PKE_REG_MASK_SET(me, STAT, PGW, 0);
 
-  return operand;
+  /* go or no go: wait if either check above reported a stall */
+  if(something_busy)
+    {
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_WAIT);
+      /* try again next cycle: the PC is deliberately not advanced */
+    }
+  else
+    {
+      unsigned_4 vu_pc;
+      int imm = BIT_MASK_GET(pkecode, PKE_OPCODE_IMM_B, PKE_OPCODE_IMM_E);
+      
+      /* flip DBF on PKE1 only; PKE0 skips this step */
+      if(me->pke_number == 1)
+       pke_flip_dbf(me);
+
+      /* compute new PC for VU from the immediate field */
+      vu_pc = BIT_MASK_GET(imm, 0, 15); /* XXX: all bits significant? */
+      /* write new PC to this PKE's VU PC address; callback function gets VU running */
+      sim_write(NULL,
+               (SIM_ADDR) (me->pke_number == 0 ? VU0_PC_START : VU1_PC_START),
+               (void*) & vu_pc,
+               sizeof(unsigned_4));
+
+      /* done: mark the PKE idle and step past this one-word PKEcode */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+      pke_pc_advance(me, 1);
+    }
 }
 
 
+void
+pke_code_stmask(struct pke_device* me, unsigned_4 pkecode)
+{
+  /* check that FIFO has one more word for STMASK operand (NULL means it is not there yet) */
+  unsigned_4* mask;
+  
+  mask = pke_pc_operand(me, 1);
+  if(mask != NULL)
+    {
+      /* "transferring" operand */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_XFER);
+      /* fill the MASK register with the operand word */
+      PKE_REG_MASK_SET(me, MASK, MASK, *mask);
+      /* done: skip the PKEcode word plus its one operand word (STROW/STCOL skip 1+4 likewise) */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+      pke_pc_advance(me, 2);
+    }
+  else
+    {
+      /* need to wait for another word */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_WAIT);
+      /* try again next cycle: the PC is deliberately not advanced */
+    }
+}
 
-/* Return pointer to FIFO quadword containing given operand# in FIFO.
-   `word_num' starts at 1.  If FIFO is not full enough, return 0. */
 
-struct fifo_quadword*
-pke_pc_fifo(struct pke_device* me, int word_num)
+void
+pke_code_strow(struct pke_device* me, unsigned_4 pkecode)
 {
-  int new_qw_pc = 0;
-  int new_fifo_pc;
-  struct fifo_quadword* operand;
-
-  ASSERT(word_num > 0);
+  /* check that FIFO has four more words for STROW operand (probe the 4th; NULL = not there yet) */
+  unsigned_4* last_op;
+  
+  last_op = pke_pc_operand(me, 4);
+  if(last_op != NULL)
+    {
+      /* "transferring" operand */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_XFER);
+      
+      /* copy operands 1..4 into word 0 of R0..R3: must all exist if 4th operand exists */
+      me->regs[PKE_REG_R0][0] = * pke_pc_operand(me, 1);
+      me->regs[PKE_REG_R1][0] = * pke_pc_operand(me, 2);
+      me->regs[PKE_REG_R2][0] = * pke_pc_operand(me, 3);
+      me->regs[PKE_REG_R3][0] = * pke_pc_operand(me, 4);
+      
+      /* done: skip the PKEcode word plus its four operand words */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+      pke_pc_advance(me, 5);
+    }
+  else
+    {
+      /* need to wait for another word */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_WAIT);
+      /* try again next cycle: the PC is deliberately not advanced */
+    }
+}
 
-  new_fifo_pc = me->fifo_pc;
-  new_qw_pc += me->qw_pc + word_num;
 
-  /* handle overflow */
-  while(new_qw_pc >= 4)
+void
+pke_code_stcol(struct pke_device* me, unsigned_4 pkecode)
+{
+  /* check that FIFO has four more words for STCOL operand (probe the 4th; NULL = not there yet) */
+  unsigned_4* last_op;
+  
+  last_op = pke_pc_operand(me, 4);
+  if(last_op != NULL)
     {
-      new_qw_pc -= 4;
-      new_fifo_pc ++;
+      /* "transferring" operand */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_XFER);
+      
+      /* copy operands 1..4 into word 0 of C0..C3: must all exist if 4th operand exists */
+      me->regs[PKE_REG_C0][0] = * pke_pc_operand(me, 1);
+      me->regs[PKE_REG_C1][0] = * pke_pc_operand(me, 2);
+      me->regs[PKE_REG_C2][0] = * pke_pc_operand(me, 3);
+      me->regs[PKE_REG_C3][0] = * pke_pc_operand(me, 4);
+      
+      /* done: skip the PKEcode word plus its four operand words */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+      pke_pc_advance(me, 5);
     }
-
-  /* not enough elements */
-  if(me->fifo_num_elements == me->fifo_pc)
-    operand = NULL;
   else
-    operand = & me->fifo[new_fifo_pc];
+    {
+      /* need to wait for another word */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_WAIT);
+      /* try again next cycle: the PC is deliberately not advanced */
+    }
+}
 
-  return operand;
+
+void
+pke_code_mpg(struct pke_device* me, unsigned_4 pkecode)
+{
+  unsigned_4* last_mpg_word;
+  int num = BIT_MASK_GET(pkecode, PKE_OPCODE_NUM_B, PKE_OPCODE_NUM_E);
+  int imm = BIT_MASK_GET(pkecode, PKE_OPCODE_IMM_B, PKE_OPCODE_IMM_E);
+
+  /* map zero to max+1 */
+  if(num==0) num=0x100;
+  
+  /* check that FIFO holds all num*2 operand words for MPG (probe the last; NULL = not yet) */
+  last_mpg_word = pke_pc_operand(me, num*2); /* num: number of 64-bit words */
+  if(last_mpg_word != NULL)
+    {
+      /* perform implied FLUSHE */
+      /* read this PKE's VU status word */
+      unsigned_4 vu_stat;
+      sim_read(NULL,
+              (SIM_ADDR) (me->pke_number == 0 ? VPE0_STAT : VPE1_STAT),
+              (void*) & vu_stat,
+              sizeof(unsigned_4));
+      
+      /* check if VBS bit is clear, i.e., VU is idle */
+      if(BIT_MASK_GET(vu_stat, VU_REG_STAT_VBS_B, VU_REG_STAT_VBS_E) == 0)
+       {
+         /* VU idle */
+         int i;
+         
+         /* "transferring" operand */
+         PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_XFER);
+         
+         /* transfer VU instructions, one word per iteration */
+         for(i=0; i<num*2; i++)
+           {
+             address_word vu_addr_base, vu_addr;
+             address_word vutrack_addr_base, vutrack_addr;
+             unsigned_4* operand;
+             struct fifo_quadword* fq = pke_pc_fifo(me, i+1, & operand);
+             
+             /* operand numbers start at 1, hence i+1; imm: in 64-bit units for MPG instruction */
+             
+             /* XXX: set NUM */
+             /* NOTE(review): (imm*2)+i is a 32-bit word index added to a base address - confirm scaling */
+             /* VU*_MEM0 : instruction memory of the VU that belongs to this PKE */
+             vu_addr_base = (me->pke_number == 0) ?
+               VU0_MEM0_WINDOW_START : VU1_MEM0_WINDOW_START;
+             vu_addr = vu_addr_base + (imm*2) + i;
+             
+             /* VU*_MEM0_TRACK : source-addr tracking table, indexed like instruction memory */
+             vutrack_addr_base = (me->pke_number == 0) ?
+               VU0_MEM0_SRCADDR_START : VU1_MEM0_SRCADDR_START;
+             vutrack_addr = vutrack_addr_base + (imm*2) + i;
+             
+             /* write data into VU memory */
+             pke_track_write(me, operand, sizeof(unsigned_4),
+                             vu_addr, fq->source_address);
+             
+             /* write srcaddr into VU srcaddr tracking table */
+             sim_write(NULL,
+                       (SIM_ADDR) vutrack_addr,
+                       (void*) & fq->source_address,
+                       sizeof(unsigned_4));
+           } /* VU xfer loop */
+         
+         /* done: skip the PKEcode word plus its num*2 operand words */
+         PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+         pke_pc_advance(me, 1 + num*2);
+       }
+      else
+       {
+         /* VU busy */
+         PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_STALL);
+         /* retry this instruction next clock */
+       }
+    } /* if FIFO full enough */
+  else
+    {
+      /* need to wait for another word */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_WAIT);
+      /* retry this instruction next clock */
+    }
 }
 
 
+void
+pke_code_direct(struct pke_device* me, unsigned_4 pkecode)
+{
+  /* check that FIFO has a few more words for DIRECT operand */
+  unsigned_4* last_direct_word;
+  int imm = BIT_MASK_GET(pkecode, PKE_OPCODE_IMM_B, PKE_OPCODE_IMM_E);
+  /* the NUM field is not used here: the operand length comes from IMM alone */
+  
+  /* map zero to max+1 */
+  if(imm==0) imm=0x10000;
+  
+  last_direct_word = pke_pc_operand(me, imm*4); /* imm: number of 128-bit words */
+  if(last_direct_word != NULL)
+    {
+      /* all imm*4 operand words are in the FIFO; no VU or PATH status is checked here */
+      int i;
+      quadword fifo_data;
+      
+      /* "transferring" operand */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_XFER);
+      
+      /* transfer GPUIF quadwords, one word per iteration */
+      for(i=0; i<imm*4; i++)
+       {
+         unsigned_4* operand;
+         struct fifo_quadword* fq = pke_pc_fifo(me, i+1, &operand); /* operand numbers start at 1 */
+         
+         /* collect word into quadword */
+         fifo_data[i%4] = *operand;
+         
+         /* write to GPUIF FIFO only once four words have been collected */
+         if(i%4 == 3)
+           {
+             address_word gpuif_fifo = GPUIF_PATH2_FIFO_ADDR+(i/4);
+             pke_track_write(me, fifo_data, sizeof(quadword),
+                             (SIM_ADDR) gpuif_fifo, fq->source_address);
+           } /* write collected quadword */
+         
+       } /* GPUIF xfer loop */
+      
+      /* done: skip the PKEcode word plus its imm*4 operand words */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+      pke_pc_advance(me, 1 + imm*4);
+    } /* if FIFO full enough */
+  else
+    {
+      /* need to wait for another word */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_WAIT);
+      /* retry this instruction next clock */
+    }
+}
 
-/* Write a bunch of bytes into simulator memory.  Store the given source address into the
-   PKE sourceaddr tracking word. */
-int
-pke_track_write(struct pke_device* me, const void* src, int len, 
-               address_word dest, unsigned_4 sourceaddr)
+
+void
+pke_code_directhl(struct pke_device* me, unsigned_4 pkecode)
 {
-  int rc;
-  unsigned_4 no_sourceaddr = 0;
+  /* treat the same as DIRECT: simply delegate to the DIRECT handler with the same PKEcode */
+  pke_code_direct(me, pkecode);
+}
 
-  /* write srcaddr into PKE srcaddr tracking */
-  sim_write(NULL,
-           (SIM_ADDR) (me->pke_number == 0) ? PKE0_SRCADDR : PKE1_SRCADDR,
-           (void*) & sourceaddr,
-           sizeof(unsigned_4));
+
+void
+pke_code_unpack(struct pke_device* me, unsigned_4 pkecode)
+{
+  int imm = BIT_MASK_GET(pkecode, PKE_OPCODE_IMM_B, PKE_OPCODE_IMM_E);
+  int cmd = BIT_MASK_GET(pkecode, PKE_OPCODE_CMD_B, PKE_OPCODE_CMD_E);
+  int num = BIT_MASK_GET(pkecode, PKE_OPCODE_NUM_B, PKE_OPCODE_NUM_E);
+
+  short vn = BIT_MASK_GET(cmd, 2, 3);
+  short vl = BIT_MASK_GET(cmd, 0, 1);
+  short vnvl = BIT_MASK_GET(cmd, 0, 3);
+  int m = BIT_MASK_GET(cmd, 4, 4);
+  short cl = PKE_REG_MASK_GET(me, CYCLE, CL);
+  short wl = PKE_REG_MASK_GET(me, CYCLE, WL);
+  int n, num_operands;
+  unsigned_4* last_operand_word;
   
-  /* write bytes into simulator */
-  rc = sim_write(NULL,
-                (SIM_ADDR) dest,
-                (void*) src,
-                len);
+  /* map zero to max+1 */
+  if(num==0) num=0x100;
   
-  /* clear srcaddr from PKE srcaddr tracking */
-  sim_write(NULL,
-           (SIM_ADDR) (me->pke_number == 0) ? PKE0_SRCADDR : PKE1_SRCADDR,
-           (void*) & no_sourceaddr,
-           sizeof(unsigned_4));
+  /* compute PKEcode length, as given in CPU2 spec, v2.1 pg. 11 */
+  if(wl <= cl)
+    n = num;
+  else
+    n = cl * (num/wl) + PKE_LIMIT(num % wl, cl);
+  num_operands = (((sizeof(unsigned_4) >> vl) * (vn+1) * n)/sizeof(unsigned_4));
+  
+  /* confirm that FIFO has enough words in it */
+  last_operand_word = pke_pc_operand(me, num_operands);
+  if(last_operand_word != NULL)
+    {
+      address_word vu_addr_base;
+      int operand_num, vector_num;
+      
+      /* "transferring" operand */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_XFER);
+      
+      /* don't check whether VU is idle */
+      
+      if(me->pke_number == 0)
+       vu_addr_base = VU0_MEM1_WINDOW_START + BIT_MASK_GET(imm, 0, 9);
+      else
+       {
+         vu_addr_base = VU1_MEM1_WINDOW_START + BIT_MASK_GET(imm, 0, 9);
+         if(BIT_MASK_GET(imm, 15, 15)) /* fetch R flag from imm word */
+           vu_addr_base += PKE_REG_MASK_GET(me, TOPS, TOPS);
+       }
+      
+      /* XXX: vu_addr overflow check */
+      
+      /* transfer given number of vectors */
+      operand_num = 1; /* word index into instruction stream: 1..num_operands */
+      vector_num = 0;  /* vector number being processed: 0..num-1 */
+      while(operand_num <= num_operands)
+       {
+         quadword vu_old_data;
+         quadword vu_new_data;
+         quadword unpacked_data;
+         address_word vu_addr;
+         struct fifo_quadword* fq;
+         int i;
+         
+         /* XXX: set NUM */
+         
+         /* compute VU destination address, as bytes in R5900 memory */
+         if(cl >= wl)
+           {
+             /* map zero to max+1 */
+             if(wl == 0) wl = 0x0100;
+             vu_addr = vu_addr_base + 16*(cl*(vector_num/wl) + (vector_num%wl));
+           }
+         else
+           vu_addr = vu_addr_base + 16*vector_num;
+         
+         /* read old VU data word at address */
+         sim_read(NULL, (SIM_ADDR) vu_addr, (void*) & vu_old_data, sizeof(vu_old_data));
+         
+         /* Let sourceaddr track the first operand */
+         fq = pke_pc_fifo(me, operand_num, NULL);
+         
+         /* For cyclic unpack, next operand quadword may come from instruction stream
+            or be zero. */
+         if((cl < wl) && ((vector_num % wl) >= cl)) /* wl != 0, set above */
+           {
+             /* clear operand - used only in a "indeterminate" state */
+             for(i = 0; i < 4; i++)
+               unpacked_data[i] = 0;
+           }
+         else
+           {
+             /* compute unpacked words from instruction stream */
+             switch(vnvl)
+               {
+               case PKE_UNPACK_S_32:
+               case PKE_UNPACK_V2_32:
+               case PKE_UNPACK_V3_32:
+               case PKE_UNPACK_V4_32:
+                 /* copy (vn+1) 32-bit values */
+                 for(i = 0; i < vn+1; i++)
+                   {
+                     unsigned_4* operand = pke_pc_operand(me, operand_num);
+                     unpacked_data[i] = *operand;
+                     operand_num ++;
+                   }
+                 break;
+                 
+               case PKE_UNPACK_S_16:
+               case PKE_UNPACK_V2_16:
+               case PKE_UNPACK_V3_16:
+               case PKE_UNPACK_V4_16:
+                 /* copy (vn+1) 16-bit values, packed two-per-word */
+                 for(i=0; i<vn+1; i+=2)
+                   {
+                     unsigned_4* operand = pke_pc_operand(me, operand_num);
+                     unpacked_data[i] = BIT_MASK_GET_SX(*operand, 0, 15, 31);
+                     unpacked_data[i+1] = BIT_MASK_GET_SX(*operand, 16, 31, 31);
+                     operand_num ++;
+                   }
+                 break;
+                 
+               case PKE_UNPACK_S_8:
+               case PKE_UNPACK_V2_8:
+               case PKE_UNPACK_V3_8:
+               case PKE_UNPACK_V4_8:
+                 /* copy (vn+1) 8-bit values, packed four-per-word */
+                 for(i=0; i<vn+1; i+=4)
+                   {
+                     unsigned_4* operand = pke_pc_operand(me, operand_num);
+                     unpacked_data[i] = BIT_MASK_GET_SX(*operand, 0, 7, 31);
+                     unpacked_data[i+1] = BIT_MASK_GET_SX(*operand, 8, 15, 31);
+                     unpacked_data[i+2] = BIT_MASK_GET_SX(*operand, 16, 23, 31);
+                     unpacked_data[i+3] = BIT_MASK_GET_SX(*operand, 24, 31, 31);
+                     operand_num ++;
+                   }
+                 break;
+                 
+               case PKE_UNPACK_V4_5:
+                 /* copy four 1/5/5/5-bit values, packed into a sixteen-bit */
+                 for(i=0; i<vn+1; i+=4)
+                   {
+                     unsigned_4* operand = pke_pc_operand(me, operand_num);
+                     unpacked_data[i] = BIT_MASK_GET_SX(*operand, 0, 4, 31);
+                     unpacked_data[i+1] = BIT_MASK_GET_SX(*operand, 5, 9, 31);
+                     unpacked_data[i+2] = BIT_MASK_GET_SX(*operand, 10, 14, 31);
+                     unpacked_data[i+3] = BIT_MASK_GET_SX(*operand, 15, 15, 31);
+                     operand_num ++;
+                   }
+                 break;
 
-  return rc;
+                 /* XXX: handle multiple rows of data in same word */ 
+                 /* clue: increment operand_num less frequently */
+
+               default: /* bad UNPACK code */
+                 {
+                   /* treat as illegal instruction */
+                   pke_code_error(me, pkecode);
+                   return;
+                 }               
+               }
+           }
+         
+         /* compute replacement word - function of vn, vl, mask */
+         if(m) /* use mask register? */
+           {
+             /* compute index into mask register for this word */
+             int mask_index = PKE_LIMIT(vector_num % wl, 3);  /* wl != 0, set above */
+             
+             for(i=0; i<3; i++) /* loop over columns */
+               {
+                 int mask_op = PKE_MASKREG_GET(me, mask_index, i);
+                 unsigned_4* masked_value = NULL;
+                 unsigned_4 zero = 0;
+                 
+                 switch(mask_op)
+                   {
+                   case PKE_MASKREG_INPUT: 
+                     /* for vn == 0, all columns are copied from column 0 */
+                     if(vn == 0)
+                       masked_value = & unpacked_data[0];
+                     else if(i > vn)
+                       masked_value = & zero; /* arbitrary data: undefined in spec */
+                     else
+                       masked_value = & unpacked_data[i];
+                     break;
+                     
+                   case PKE_MASKREG_ROW: /* exploit R0..R3 contiguity */
+                     masked_value = & me->regs[PKE_REG_R0 + i][0];
+                     break;
+                     
+                   case PKE_MASKREG_COLUMN: /* exploit C0..C3 contiguity */
+                     masked_value = & me->regs[PKE_REG_C0 + PKE_LIMIT(vector_num,3)][0];
+                     break;
+                     
+                   case PKE_MASKREG_NOTHING:
+                     /* "write inhibit" by re-copying old data */
+                     masked_value = & vu_old_data[i];
+                     break;
+                     
+                   default:
+                     ASSERT(0);
+                     /* no other cases possible */
+                   }
+                 
+                 /* copy masked value for column */
+                 memcpy(& vu_new_data[i], masked_value, sizeof(unsigned_4));
+               } /* loop over columns */
+           }
+         else
+           {
+             /* no mask - just copy over entire unpacked quadword */
+             memcpy(vu_new_data, unpacked_data, sizeof(unpacked_data));
+           }
+         
+         /* process STMOD register for accumulation operations */
+         switch(PKE_REG_MASK_GET(me, MODE, MDE))
+           {
+           case PKE_MODE_ADDROW: /* add row registers to output data */
+             for(i=0; i<4; i++)
+               /* exploit R0..R3 contiguity */
+               vu_new_data[i] += me->regs[PKE_REG_R0 + i][0];
+             break;
+
+           case PKE_MODE_ACCROW: /* add row registers to output data; accumulate */
+             for(i=0; i<4; i++)
+               {
+                 /* exploit R0..R3 contiguity */
+                 vu_new_data[i] += me->regs[PKE_REG_R0 + i][0];
+                 me->regs[PKE_REG_R0 + i][0] = vu_new_data[i];
+               }
+             break;
+
+           case PKE_MODE_INPUT: /* pass data through */
+           default:
+             ;
+           }
+
+         /* write replacement word */
+         pke_track_write(me, vu_new_data, sizeof(vu_new_data),
+                         (SIM_ADDR) vu_addr, fq->source_address);
+
+         /* next vector please */
+         vector_num ++;
+       } /* vector transfer loop */
+
+      /* done */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+      pke_pc_advance(me, num_operands);
+    } /* PKE FIFO full enough */
+  else
+    {
+      /* need to wait for another word */
+      PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_WAIT);
+      /* retry this instruction next clock */
+    }
+}
+
+
+/* Handle a PKEcode that cannot be executed (it is called, for example,
+   from the bad-UNPACK-code default case): record the error in the STAT
+   register and step the PKE program counter past the offending word.
+   ME is the PKE device whose registers and PC are updated.  PKECODE is
+   the faulty instruction word; this body does not reference it.  */
+void
+pke_code_error(struct pke_device* me, unsigned_4 pkecode)
+{
+  /* set ER1 flag in STAT register */
+  PKE_REG_MASK_SET(me, STAT, ER1, 1);
+  /* advance over faulty word */
+  /* PPS is put back to IDLE first, then the PC is moved on by one word */
+  PKE_REG_MASK_SET(me, STAT, PPS, PKE_REG_STAT_PPS_IDLE);
+  pke_pc_advance(me, 1);
 }
index 74b8bc77a095fc17d4b2a3a87ade754eb5b64946..c378c640eb0110292f76efe3cc2e6505f58f42b3 100644 (file)
@@ -7,10 +7,6 @@
 #include "sky-device.h"
 
 
-/* Debugguing PKE? */
-
-#define PKE_DEBUG 
-
 
 /* External functions */
 
@@ -163,10 +159,11 @@ typedef unsigned_4 quadword[4];
 #define PKE_REG_STAT_PPS_E 1
 #define PKE_REG_STAT_PPS_B 0
 
-#define PKE_REG_STAT_PPS_IDLE 0x00
-#define PKE_REG_STAT_PPS_WAIT 0x01
-#define PKE_REG_STAT_PPS_DECODE 0x02
-#define PKE_REG_STAT_PPS_XFER 0x03
+#define PKE_REG_STAT_PPS_IDLE 0x00 /* ready to execute next instruction */
+#define PKE_REG_STAT_PPS_WAIT 0x01 /* not enough words in FIFO */
+#define PKE_REG_STAT_PPS_DECODE 0x02 /* decoding instruction */
+#define PKE_REG_STAT_PPS_STALL 0x02 /* alias state for FLUSHE stall */
+#define PKE_REG_STAT_PPS_XFER 0x03 /* transferring instruction operands */
 
 /* DBF register */
 #define PKE_REG_DBF_DF_E 0
@@ -364,7 +361,7 @@ struct pke_device
   struct fifo_quadword* fifo;
   int fifo_num_elements; /* no. of quadwords occupied in FIFO */
   int fifo_buffer_size;  /* no. of quadwords of space in FIFO */
-  FILE* fifo_trace_file; /* or 0 for no trace */
+  FILE* fifo_trace_file; /* or 0 for no trace */ /* XXX: tracing not done */
   /* XXX: assumes FIFOs grow indefinately */
 
   /* PC */