cpu: implement an L-TAGE branch predictor
authorArthur Perais <arthur.perais@inria.fr>
Wed, 21 Dec 2016 21:25:13 +0000 (15:25 -0600)
committerArthur Perais <arthur.perais@inria.fr>
Wed, 21 Dec 2016 21:25:13 +0000 (15:25 -0600)
This patch implements an L-TAGE predictor, based on André Seznec's code
available from CBP-2
(http://hpca23.cse.tamu.edu/taco/camino/cbp2/cbp-src/realistic-seznec.h).

Signed-off-by Jason Lowe-Power <jason@lowepower.com>

src/cpu/pred/BranchPredictor.py
src/cpu/pred/SConscript
src/cpu/pred/ltage.cc [new file with mode: 0644]
src/cpu/pred/ltage.hh [new file with mode: 0644]

index 2d7d0d0e273b3978237e5ef2e5835be2ba045113..1b400c2b769b64a95a512845ce1e66b64afa9d69 100644 (file)
@@ -86,3 +86,19 @@ class BiModeBP(BranchPredictor):
     choicePredictorSize = Param.Unsigned(8192, "Size of choice predictor")
     choiceCtrBits = Param.Unsigned(2, "Bits of choice counters")
 
+class LTAGE(BranchPredictor):
+    type = 'LTAGE'
+    cxx_class = 'LTAGE'
+    cxx_header = "cpu/pred/ltage.hh"
+
+    logSizeBiMP = Param.Unsigned(14, "Log size of Bimodal predictor in bits")
+    logSizeTagTables = Param.Unsigned(11, "Log size of tag table in LTAGE")
+    logSizeLoopPred = Param.Unsigned(8, "Log size of the loop predictor")
+    nHistoryTables = Param.Unsigned(12, "Number of history tables")
+    tagTableCounterBits = Param.Unsigned(3, "Number of tag table counter bits")
+    histBufferSize = Param.Unsigned(2097152,
+            "A large number to track all branch histories(2MEntries default)")
+    minHist = Param.Unsigned(4, "Minimum history size of LTAGE")
+    maxHist = Param.Unsigned(640, "Maximum history size of LTAGE")
+    minTagWidth = Param.Unsigned(7, "Minimum tag size in tag tables")
+
index dca5e8d882e316ef71f2df4dde6e8e6122e3d4fd..1cdf7bbd7a3534bf6b7ca7ac41841e9ebea84177 100644 (file)
@@ -43,6 +43,7 @@ Source('indirect.cc')
 Source('ras.cc')
 Source('tournament.cc')
 Source ('bi_mode.cc')
+Source('ltage.cc')
 DebugFlag('FreeList')
 DebugFlag('Branch')
 DebugFlag('LTage')
diff --git a/src/cpu/pred/ltage.cc b/src/cpu/pred/ltage.cc
new file mode 100644 (file)
index 0000000..84fc9a4
--- /dev/null
@@ -0,0 +1,746 @@
+/*
+ * Copyright (c) 2014 The University of Wisconsin
+ *
+ * Copyright (c) 2006 INRIA (Institut National de Recherche en
+ * Informatique et en Automatique  / French National Research Institute
+ * for Computer Science and Applied Mathematics)
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Vignyan Reddy, Dibakar Gope and Arthur Perais,
+ * from André Seznec's code.
+ */
+
+/* @file
+ * Implementation of a L-TAGE branch predictor
+ */
+
+#include "cpu/pred/ltage.hh"
+
+#include "base/intmath.hh"
+#include "base/misc.hh"
+#include "base/random.hh"
+#include "base/trace.hh"
+#include "debug/Fetch.hh"
+#include "debug/LTage.hh"
+
+LTAGE::LTAGE(const LTAGEParams *params)
+  : BPredUnit(params),
+    logSizeBiMP(params->logSizeBiMP),
+    logSizeTagTables(params->logSizeTagTables),
+    logSizeLoopPred(params->logSizeLoopPred),
+    nHistoryTables(params->nHistoryTables),
+    tagTableCounterBits(params->tagTableCounterBits),
+    histBufferSize(params->histBufferSize),
+    minHist(params->minHist),
+    maxHist(params->maxHist),
+    minTagWidth(params->minTagWidth),
+    threadHistory(params->numThreads)
+{
+    assert(params->histBufferSize > params->maxHist * 2);
+    useAltPredForNewlyAllocated = 0;
+    logTick = 19;
+    tCounter = ULL(1) << (logTick - 1);
+
+    for (auto& history : threadHistory) {
+        history.pathHist = 0;
+        history.globalHistory = new uint8_t[histBufferSize];
+        history.gHist = history.globalHistory;
+        memset(history.gHist, 0, histBufferSize);
+        history.ptGhist = 0;
+    }
+
+    histLengths = new int [nHistoryTables+1];
+    histLengths[1] = minHist;
+    histLengths[nHistoryTables] = maxHist;
+
+    for (int i = 2; i <= nHistoryTables; i++) {
+        histLengths[i] = (int) (((double) minHist *
+                    pow ((double) (maxHist) / (double) minHist,
+                        (double) (i - 1) / (double) ((nHistoryTables- 1))))
+                    + 0.5);
+    }
+
+    tagWidths[1] = minTagWidth;
+    tagWidths[2] = minTagWidth;
+    tagWidths[3] = minTagWidth + 1;
+    tagWidths[4] = minTagWidth + 1;
+    tagWidths[5] = minTagWidth + 2;
+    tagWidths[6] = minTagWidth + 3;
+    tagWidths[7] = minTagWidth + 4;
+    tagWidths[8] = minTagWidth + 5;
+    tagWidths[9] = minTagWidth + 5;
+    tagWidths[10] = minTagWidth + 6;
+    tagWidths[11] = minTagWidth + 7;
+    tagWidths[12] = minTagWidth + 8;
+
+    for (int i = 1; i <= 2; i++)
+        tagTableSizes[i] = logSizeTagTables - 1;
+    for (int i = 3; i <= 6; i++)
+        tagTableSizes[i] = logSizeTagTables;
+    for (int i = 7; i <= 10; i++)
+        tagTableSizes[i] = logSizeTagTables - 1;
+    for (int i = 11; i <= 12; i++)
+        tagTableSizes[i] = logSizeTagTables - 2;
+
+    for (auto& history : threadHistory) {
+        history.computeIndices = new FoldedHistory[nHistoryTables+1];
+        history.computeTags[0] = new FoldedHistory[nHistoryTables+1];
+        history.computeTags[1] = new FoldedHistory[nHistoryTables+1];
+
+        for (int i = 1; i <= nHistoryTables; i++) {
+            history.computeIndices[i].init(histLengths[i], (tagTableSizes[i]));
+            history.computeTags[0][i].init(
+                history.computeIndices[i].origLength, tagWidths[i]);
+            history.computeTags[1][i].init(
+                history.computeIndices[i].origLength, tagWidths[i] - 1);
+            DPRINTF(LTage, "HistLength:%d, TTSize:%d, TTTWidth:%d\n",
+                    histLengths[i], tagTableSizes[i], tagWidths[i]);
+        }
+    }
+
+    btable = new BimodalEntry[ULL(1) << logSizeBiMP];
+    ltable = new LoopEntry[ULL(1) << logSizeLoopPred];
+    gtable = new TageEntry*[nHistoryTables + 1];
+    for (int i = 1; i <= nHistoryTables; i++) {
+        gtable[i] = new TageEntry[1<<(tagTableSizes[i])];
+    }
+
+    tableIndices = new int [nHistoryTables+1];
+    tableTags = new int [nHistoryTables+1];
+
+    loopUseCounter = 0;
+}
+
+int
+LTAGE::bindex(Addr pc_in) const
+{
+    return ((pc_in) & ((ULL(1) << (logSizeBiMP)) - 1));
+}
+
+int
+LTAGE::lindex(Addr pc_in) const
+{
+    return (((pc_in) & ((ULL(1) << (logSizeLoopPred - 2)) - 1)) << 2);
+}
+
+int
+LTAGE::F(int A, int size, int bank) const
+{
+    int A1, A2;
+
+    A = A & ((ULL(1) << size) - 1);
+    A1 = (A & ((ULL(1) << tagTableSizes[bank]) - 1));
+    A2 = (A >> tagTableSizes[bank]);
+    A2 = ((A2 << bank) & ((ULL(1) << tagTableSizes[bank]) - 1))
+       + (A2 >> (tagTableSizes[bank] - bank));
+    A = A1 ^ A2;
+    A = ((A << bank) & ((ULL(1) << tagTableSizes[bank]) - 1))
+      + (A >> (tagTableSizes[bank] - bank));
+    return (A);
+}
+
+
+// gindex computes a full hash of pc, ghist and pathHist
+int
+LTAGE::gindex(ThreadID tid, Addr pc, int bank) const
+{
+    int index;
+    int hlen = (histLengths[bank] > 16) ? 16 : histLengths[bank];
+    index =
+        (pc) ^ ((pc) >> ((int) abs(tagTableSizes[bank] - bank) + 1)) ^
+        threadHistory[tid].computeIndices[bank].comp ^
+        F(threadHistory[tid].pathHist, hlen, bank);
+
+    return (index & ((ULL(1) << (tagTableSizes[bank])) - 1));
+}
+
+
+// Tag computation
+uint16_t
+LTAGE::gtag(ThreadID tid, Addr pc, int bank) const
+{
+    int tag = (pc) ^ threadHistory[tid].computeTags[0][bank].comp
+                   ^ (threadHistory[tid].computeTags[1][bank].comp << 1);
+
+    return (tag & ((ULL(1) << tagWidths[bank]) - 1));
+}
+
+
+// Up-down saturating counter
+void
+LTAGE::ctrUpdate(int8_t & ctr, bool taken, int nbits)
+{
+    assert(nbits <= sizeof(int8_t) << 3);
+    if (taken) {
+        if (ctr < ((1 << (nbits - 1)) - 1))
+            ctr++;
+    } else {
+        if (ctr > -(1 << (nbits - 1)))
+            ctr--;
+    }
+}
+
+// Bimodal prediction
+bool
+LTAGE::getBimodePred(Addr pc, BranchInfo* bi) const
+{
+    return (btable[bi->bimodalIndex].pred > 0);
+}
+
+
+// Update the bimodal predictor: a hysteresis bit is shared among 4 prediction
+// bits
+void
+LTAGE::baseUpdate(Addr pc, bool taken, BranchInfo* bi)
+{
+    int inter = (btable[bi->bimodalIndex].pred << 1)
+              + btable[bi->bimodalIndex ].hyst;
+    if (taken) {
+        if (inter < 3)
+            inter++;
+    } else if (inter > 0) {
+        inter--;
+    }
+    btable[bi->bimodalIndex].pred = inter >> 1;
+    btable[bi->bimodalIndex].hyst = (inter & 1);
+    DPRINTF(LTage, "Updating branch %lx, pred:%d, hyst:%d\n",
+            pc, btable[bi->bimodalIndex].pred,btable[bi->bimodalIndex].hyst);
+}
+
+
+//loop prediction: only used if high confidence
+bool
+LTAGE::getLoop(Addr pc, BranchInfo* bi) const
+{
+    bi->loopHit = -1;
+    bi->loopPredValid = false;
+    bi->loopIndex = lindex(pc);
+    bi->loopTag = ((pc) >> (logSizeLoopPred - 2));
+
+    for (int i = 0; i < 4; i++) {
+        if (ltable[bi->loopIndex + i].tag == bi->loopTag) {
+            bi->loopHit = i;
+            bi->loopPredValid = (ltable[bi->loopIndex + i].confidence >= 3);
+            bi->currentIter = ltable[bi->loopIndex + i].currentIterSpec;
+            if (ltable[bi->loopIndex + i].currentIterSpec + 1 ==
+                ltable[bi->loopIndex + i].numIter) {
+                return !(ltable[bi->loopIndex + i].dir);
+            }else {
+                return (ltable[bi->loopIndex + i].dir);
+            }
+        }
+    }
+    return false;
+}
+
+void
+LTAGE::specLoopUpdate(Addr pc, bool taken, BranchInfo* bi)
+{
+    if (bi->loopHit>=0) {
+        int index = lindex(pc);
+        if (taken != ltable[index].dir) {
+            ltable[index].currentIterSpec = 0;
+        } else {
+            ltable[index].currentIterSpec++;
+        }
+    }
+}
+
+void
+LTAGE::loopUpdate(Addr pc, bool taken, BranchInfo* bi)
+{
+    int idx = bi->loopIndex + bi->loopHit;
+    if (bi->loopHit >= 0) {
+        //already a hit
+        if (bi->loopPredValid) {
+            if (taken != bi->loopPred) {
+                // free the entry
+                ltable[idx].numIter = 0;
+                ltable[idx].age = 0;
+                ltable[idx].confidence = 0;
+                ltable[idx].currentIter = 0;
+                return;
+            } else if (bi->loopPred != bi->tagePred) {
+                DPRINTF(LTage, "Loop Prediction success:%lx\n",pc);
+                if (ltable[idx].age < 7)
+                    ltable[idx].age++;
+            }
+        }
+
+        ltable[idx].currentIter++;
+        if (ltable[idx].currentIter > ltable[idx].numIter) {
+            ltable[idx].confidence = 0;
+            if (ltable[idx].numIter != 0) {
+                // free the entry
+                ltable[idx].numIter = 0;
+                ltable[idx].age = 0;
+                ltable[idx].confidence = 0;
+            }
+        }
+
+        if (taken != ltable[idx].dir) {
+            if (ltable[idx].currentIter == ltable[idx].numIter) {
+                DPRINTF(LTage, "Loop End predicted successfully:%lx\n", pc);
+
+                if (ltable[idx].confidence < 7) {
+                    ltable[idx].confidence++;
+                }
+                //just do not predict when the loop count is 1 or 2
+                if (ltable[idx].numIter < 3) {
+                    // free the entry
+                    ltable[idx].dir = taken;
+                    ltable[idx].numIter = 0;
+                    ltable[idx].age = 0;
+                    ltable[idx].confidence = 0;
+                }
+            } else {
+                DPRINTF(LTage, "Loop End predicted incorrectly:%lx\n", pc);
+                if (ltable[idx].numIter == 0) {
+                    // first complete nest;
+                    ltable[idx].confidence = 0;
+                    ltable[idx].numIter = ltable[idx].currentIter;
+                } else {
+                    //not the same number of iterations as last time: free the
+                    //entry
+                    ltable[idx].numIter = 0;
+                    ltable[idx].age = 0;
+                    ltable[idx].confidence = 0;
+                }
+            }
+            ltable[idx].currentIter = 0;
+        }
+
+    } else if (taken) {
+        //try to allocate an entry on taken branch
+        int nrand = random_mt.random<int>();
+        for (int i = 0; i < 4; i++) {
+            int loop_hit = (nrand + i) & 3;
+            idx = bi->loopIndex + loop_hit;
+            if (ltable[idx].age == 0) {
+                DPRINTF(LTage, "Allocating loop pred entry for branch %lx\n",
+                        pc);
+                ltable[idx].dir = !taken;
+                ltable[idx].tag = bi->loopTag;
+                ltable[idx].numIter = 0;
+                ltable[idx].age = 7;
+                ltable[idx].confidence = 0;
+                ltable[idx].currentIter = 1;
+                break;
+
+            }
+            else
+                ltable[idx].age--;
+        }
+    }
+
+}
+
+// shifting the global history:  we manage the history in a big table in order
+// to reduce simulation time
+void
+LTAGE::updateGHist(uint8_t * &h, bool dir, uint8_t * tab, int &pt)
+{
+    if (pt == 0) {
+        DPRINTF(LTage, "Rolling over the histories\n");
+         // Copy beginning of globalHistoryBuffer to end, such that
+         // the last maxHist outcomes are still reachable
+         // through pt[0 .. maxHist - 1].
+         for (int i = 0; i < maxHist; i++)
+             tab[histBufferSize - maxHist + i] = tab[i];
+         pt =  histBufferSize - maxHist;
+         h = &tab[pt];
+    }
+    pt--;
+    h--;
+    h[0] = (dir) ? 1 : 0;
+}
+
+// Get GHR for hashing indirect predictor
+// Build history backwards from pointer in
+// bp_history.
+unsigned
+LTAGE::getGHR(ThreadID tid, void *bp_history) const
+{
+    BranchInfo* bi = static_cast<BranchInfo*>(bp_history);
+    unsigned val = 0;
+    for (unsigned i = 0; i < 32; i++) {
+        // Make sure we don't go out of bounds
+        int gh_offset = bi->ptGhist + i;
+        assert(&(threadHistory[tid].globalHistory[gh_offset]) <
+               threadHistory[tid].globalHistory + histBufferSize);
+        val |= ((threadHistory[tid].globalHistory[gh_offset] & 0x1) << i);
+    }
+
+    return val;
+}
+
+//prediction
+bool
+LTAGE::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void* &b)
+{
+    BranchInfo *bi = new BranchInfo(nHistoryTables+1);
+    b = (void*)(bi);
+    Addr pc = branch_pc;
+    bool pred_taken = true;
+    bi->loopHit = -1;
+
+    if (cond_branch) {
+        // TAGE prediction
+
+        // computes the table addresses and the partial tags
+        for (int i = 1; i <= nHistoryTables; i++) {
+            tableIndices[i] = gindex(tid, pc, i);
+            bi->tableIndices[i] = tableIndices[i];
+            tableTags[i] = gtag(tid, pc, i);
+            bi->tableTags[i] = tableTags[i];
+        }
+
+        bi->bimodalIndex = bindex(pc);
+
+        bi->hitBank = 0;
+        bi->altBank = 0;
+        //Look for the bank with longest matching history
+        for (int i = nHistoryTables; i > 0; i--) {
+            if (gtable[i][tableIndices[i]].tag == tableTags[i]) {
+                bi->hitBank = i;
+                bi->hitBankIndex = tableIndices[bi->hitBank];
+                break;
+            }
+        }
+        //Look for the alternate bank
+        for (int i = bi->hitBank - 1; i > 0; i--) {
+            if (gtable[i][tableIndices[i]].tag == tableTags[i]) {
+                bi->altBank = i;
+                bi->altBankIndex = tableIndices[bi->altBank];
+                break;
+            }
+        }
+        //computes the prediction and the alternate prediction
+        if (bi->hitBank > 0) {
+            if (bi->altBank > 0) {
+                bi->altTaken =
+                    gtable[bi->altBank][tableIndices[bi->altBank]].ctr >= 0;
+            }else {
+                bi->altTaken = getBimodePred(pc, bi);
+            }
+
+            bi->longestMatchPred =
+                gtable[bi->hitBank][tableIndices[bi->hitBank]].ctr >= 0;
+            bi->pseudoNewAlloc =
+                abs(2 * gtable[bi->hitBank][bi->hitBankIndex].ctr + 1) <= 1;
+
+            //if the entry is recognized as a newly allocated entry and
+            //useAltPredForNewlyAllocated is positive use the alternate
+            //prediction
+            if ((useAltPredForNewlyAllocated < 0)
+                   || abs(2 *
+                   gtable[bi->hitBank][tableIndices[bi->hitBank]].ctr + 1) > 1)
+                bi->tagePred = bi->longestMatchPred;
+            else
+                bi->tagePred = bi->altTaken;
+        } else {
+            bi->altTaken = getBimodePred(pc, bi);
+            bi->tagePred = bi->altTaken;
+            bi->longestMatchPred = bi->altTaken;
+        }
+        //end TAGE prediction
+
+        bi->loopPred = getLoop(pc, bi);        // loop prediction
+
+        pred_taken = (((loopUseCounter >= 0) && bi->loopPredValid)) ?
+                     (bi->loopPred): (bi->tagePred);
+        DPRINTF(LTage, "Predict for %lx: taken?:%d, loopTaken?:%d, "
+                "loopValid?:%d, loopUseCounter:%d, tagePred:%d, altPred:%d\n",
+                branch_pc, pred_taken, bi->loopPred, bi->loopPredValid,
+                loopUseCounter, bi->tagePred, bi->altTaken);
+    }
+    bi->branchPC = branch_pc;
+    bi->condBranch = cond_branch;
+    specLoopUpdate(branch_pc, pred_taken, bi);
+    return pred_taken;
+}
+
+// PREDICTOR UPDATE
+void
+LTAGE::update(ThreadID tid, Addr branch_pc, bool taken, void* bp_history,
+              bool squashed)
+{
+    assert(bp_history);
+
+    BranchInfo *bi = static_cast<BranchInfo*>(bp_history);
+
+    if (squashed) {
+        // This restores the global history, then update it
+        // and recomputes the folded histories.
+        squash(tid, taken, bp_history);
+        return;
+    }
+
+    int nrand  = random_mt.random<int>(0,3);
+    Addr pc = branch_pc;
+    if (bi->condBranch) {
+        DPRINTF(LTage, "Updating tables for branch:%lx; taken?:%d\n",
+                branch_pc, taken);
+        // first update the loop predictor
+        loopUpdate(pc, taken, bi);
+
+        if (bi->loopPredValid) {
+            if (bi->tagePred != bi->loopPred) {
+                ctrUpdate(loopUseCounter, (bi->loopPred== taken), 7);
+            }
+        }
+
+        // TAGE UPDATE
+        // try to allocate a  new entries only if prediction was wrong
+        bool longest_match_pred = false;
+        bool alloc = (bi->tagePred != taken) && (bi->hitBank < nHistoryTables);
+        if (bi->hitBank > 0) {
+            // Manage the selection between longest matching and alternate
+            // matching for "pseudo"-newly allocated longest matching entry
+             longest_match_pred = bi->longestMatchPred;
+            bool PseudoNewAlloc = bi->pseudoNewAlloc;
+            // an entry is considered as newly allocated if its prediction
+            // counter is weak
+            if (PseudoNewAlloc) {
+                if (longest_match_pred == taken) {
+                    alloc = false;
+                }
+                // if it was delivering the correct prediction, no need to
+                // allocate new entry even if the overall prediction was false
+                if (longest_match_pred != bi->altTaken) {
+                    ctrUpdate(useAltPredForNewlyAllocated,
+                         bi->altTaken == taken, 4);
+                }
+            }
+        }
+
+        if (alloc) {
+            // is there some "unuseful" entry to allocate
+            int8_t min = 1;
+            for (int i = nHistoryTables; i > bi->hitBank; i--) {
+                if (gtable[i][bi->tableIndices[i]].u < min) {
+                    min = gtable[i][bi->tableIndices[i]].u;
+                }
+            }
+
+            // we allocate an entry with a longer history
+            // to  avoid ping-pong, we do not choose systematically the next
+            // entry, but among the 3 next entries
+            int Y = nrand &
+                ((ULL(1) << (nHistoryTables - bi->hitBank - 1)) - 1);
+            int X = bi->hitBank + 1;
+            if (Y & 1) {
+                X++;
+                if (Y & 2)
+                    X++;
+            }
+            // No entry available, forces one to be available
+            if (min > 0) {
+                gtable[X][bi->tableIndices[X]].u = 0;
+            }
+
+
+            //Allocate only  one entry
+            for (int i = X; i <= nHistoryTables; i++) {
+                if ((gtable[i][bi->tableIndices[i]].u == 0)) {
+                    gtable[i][bi->tableIndices[i]].tag = bi->tableTags[i];
+                    gtable[i][bi->tableIndices[i]].ctr = (taken) ? 0 : -1;
+                    gtable[i][bi->tableIndices[i]].u = 0; //?
+                }
+            }
+        }
+        //periodic reset of u: reset is not complete but bit by bit
+        tCounter++;
+        if ((tCounter & ((ULL(1) << logTick) - 1)) == 0) {
+            // reset least significant bit
+            // most significant bit becomes least significant bit
+            for (int i = 1; i <= nHistoryTables; i++) {
+                for (int j = 0; j < (ULL(1) << tagTableSizes[i]); j++) {
+                    gtable[i][j].u = gtable[i][j].u >> 1;
+                }
+            }
+        }
+
+        if (bi->hitBank > 0) {
+            DPRINTF(LTage, "Updating tag table entry (%d,%d) for branch %lx\n",
+                    bi->hitBank, bi->hitBankIndex, branch_pc);
+            ctrUpdate(gtable[bi->hitBank][bi->hitBankIndex].ctr, taken,
+                      tagTableCounterBits);
+            // if the provider entry is not certified to be useful also update
+            // the alternate prediction
+            if (gtable[bi->hitBank][bi->hitBankIndex].u == 0) {
+                if (bi->altBank > 0) {
+                    ctrUpdate(gtable[bi->altBank][bi->altBankIndex].ctr, taken,
+                              tagTableCounterBits);
+                    DPRINTF(LTage, "Updating tag table entry (%d,%d) for"
+                            " branch %lx\n", bi->hitBank, bi->hitBankIndex,
+                            branch_pc);
+                }
+                if (bi->altBank == 0) {
+                    baseUpdate(pc, taken, bi);
+                }
+            }
+
+            // update the u counter
+            if (longest_match_pred != bi->altTaken) {
+                if (longest_match_pred == taken) {
+                    if (gtable[bi->hitBank][bi->hitBankIndex].u < 1) {
+                        gtable[bi->hitBank][bi->hitBankIndex].u++;
+                    }
+                }
+            }
+        } else {
+            baseUpdate(pc, taken, bi);
+        }
+
+        //END PREDICTOR UPDATE
+    }
+    if (!squashed) {
+        delete bi;
+    }
+}
+
+void
+LTAGE::updateHistories(ThreadID tid, Addr branch_pc, bool taken, void* b)
+{
+    BranchInfo* bi = (BranchInfo*)(b);
+    ThreadHistory& tHist = threadHistory[tid];
+    //  UPDATE HISTORIES
+    bool pathbit = ((branch_pc) & 1);
+    //on a squash, return pointers to this and recompute indices.
+    //update user history
+    updateGHist(tHist.gHist, taken, tHist.globalHistory, tHist.ptGhist);
+    tHist.pathHist = (tHist.pathHist << 1) + pathbit;
+    tHist.pathHist = (tHist.pathHist & ((ULL(1) << 16) - 1));
+
+    bi->ptGhist = tHist.ptGhist;
+    bi->pathHist = tHist.pathHist;
+    //prepare next index and tag computations for user branchs
+    for (int i = 1; i <= nHistoryTables; i++)
+    {
+        bi->ci[i]  = tHist.computeIndices[i].comp;
+        bi->ct0[i] = tHist.computeTags[0][i].comp;
+        bi->ct1[i] = tHist.computeTags[1][i].comp;
+        tHist.computeIndices[i].update(tHist.gHist);
+        tHist.computeTags[0][i].update(tHist.gHist);
+        tHist.computeTags[1][i].update(tHist.gHist);
+    }
+    DPRINTF(LTage, "Updating global histories with branch:%lx; taken?:%d, "
+            "path Hist: %x; pointer:%d\n", branch_pc, taken, tHist.pathHist,
+            tHist.ptGhist);
+}
+
+void
+LTAGE::squash(ThreadID tid, bool taken, void *bp_history)
+{
+    BranchInfo* bi = (BranchInfo*)(bp_history);
+    ThreadHistory& tHist = threadHistory[tid];
+    DPRINTF(LTage, "Restoring branch info: %lx; taken? %d; PathHistory:%x, "
+            "pointer:%d\n", bi->branchPC,taken, bi->pathHist, bi->ptGhist);
+    tHist.pathHist = bi->pathHist;
+    tHist.ptGhist = bi->ptGhist;
+    tHist.gHist = &(tHist.globalHistory[tHist.ptGhist]);
+    tHist.gHist[0] = (taken ? 1 : 0);
+    for (int i = 1; i <= nHistoryTables; i++) {
+        tHist.computeIndices[i].comp = bi->ci[i];
+        tHist.computeTags[0][i].comp = bi->ct0[i];
+        tHist.computeTags[1][i].comp = bi->ct1[i];
+        tHist.computeIndices[i].update(tHist.gHist);
+        tHist.computeTags[0][i].update(tHist.gHist);
+        tHist.computeTags[1][i].update(tHist.gHist);
+    }
+
+    if (bi->condBranch) {
+        if (bi->loopHit >= 0) {
+            int idx = bi->loopIndex + bi->loopHit;
+            ltable[idx].currentIterSpec = bi->currentIter;
+        }
+    }
+
+}
+
+void
+LTAGE::squash(ThreadID tid, void *bp_history)
+{
+    BranchInfo* bi = (BranchInfo*)(bp_history);
+    DPRINTF(LTage, "Deleting branch info: %lx\n", bi->branchPC);
+    if (bi->condBranch) {
+        if (bi->loopHit >= 0) {
+            int idx = bi->loopIndex + bi->loopHit;
+            ltable[idx].currentIterSpec = bi->currentIter;
+        }
+    }
+
+    delete bi;
+}
+
+bool
+LTAGE::lookup(ThreadID tid, Addr branch_pc, void* &bp_history)
+{
+    bool retval = predict(tid, branch_pc, true, bp_history);
+
+    DPRINTF(LTage, "Lookup branch: %lx; predict:%d\n", branch_pc, retval);
+    updateHistories(tid, branch_pc, retval, bp_history);
+    assert(threadHistory[tid].gHist ==
+           &threadHistory[tid].globalHistory[threadHistory[tid].ptGhist]);
+
+    return retval;
+}
+
+void
+LTAGE::btbUpdate(ThreadID tid, Addr branch_pc, void* &bp_history)
+{
+    BranchInfo* bi = (BranchInfo*) bp_history;
+    ThreadHistory& tHist = threadHistory[tid];
+    DPRINTF(LTage, "BTB miss resets prediction: %lx\n", branch_pc);
+    assert(tHist.gHist == &tHist.globalHistory[tHist.ptGhist]);
+    tHist.gHist[0] = 0;
+    for (int i = 1; i <= nHistoryTables; i++) {
+        tHist.computeIndices[i].comp = bi->ci[i];
+        tHist.computeTags[0][i].comp = bi->ct0[i];
+        tHist.computeTags[1][i].comp = bi->ct1[i];
+        tHist.computeIndices[i].update(tHist.gHist);
+        tHist.computeTags[0][i].update(tHist.gHist);
+        tHist.computeTags[1][i].update(tHist.gHist);
+    }
+}
+
+void
+LTAGE::uncondBranch(ThreadID tid, Addr br_pc, void* &bp_history)
+{
+    DPRINTF(LTage, "UnConditionalBranch: %lx\n", br_pc);
+    predict(tid, br_pc, false, bp_history);
+    updateHistories(tid, br_pc, true, bp_history);
+    assert(threadHistory[tid].gHist ==
+           &threadHistory[tid].globalHistory[threadHistory[tid].ptGhist]);
+}
+
+LTAGE*
+LTAGEParams::create()
+{
+    return new LTAGE(this);
+}
diff --git a/src/cpu/pred/ltage.hh b/src/cpu/pred/ltage.hh
new file mode 100644 (file)
index 0000000..60c3467
--- /dev/null
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2014 The University of Wisconsin
+ *
+ * Copyright (c) 2006 INRIA (Institut National de Recherche en
+ * Informatique et en Automatique  / French National Research Institute
+ * for Computer Science and Applied Mathematics)
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Vignyan Reddy, Dibakar Gope and Arthur Perais,
+ * from André Seznec's code.
+ */
+
+/* @file
+ * Implementation of a L-TAGE branch predictor. TAGE is a global-history based
+ * branch predictor. It features a PC-indexed bimodal predictor and N
+ * partially tagged tables, indexed with a hash of the PC and the global
+ * branch history. The different lengths of global branch history used to
+ * index the partially tagged tables grow geometrically. A small path history
+ * is also used in the hash. L-TAGE also features a loop predictor that records
+ * iteration count of loops and predicts accordingly.
+ *
+ * All TAGE tables are accessed in parallel, and the one using the longest
+ * history that matches provides the prediction (some exceptions apply).
+ * Entries are allocated in components using a longer history than the
+ * one that predicted when the prediction is incorrect.
+ */
+
+#ifndef __CPU_PRED_LTAGE
+#define __CPU_PRED_LTAGE
+
+#include <vector>
+
+#include "base/types.hh"
+#include "cpu/pred/bpred_unit.hh"
+#include "params/LTAGE.hh"
+
+class LTAGE: public BPredUnit
+{
+  public:
+    LTAGE(const LTAGEParams *params);
+
+    // Base class methods.
+    void uncondBranch(ThreadID tid, Addr br_pc, void* &bp_history) override;
+    bool lookup(ThreadID tid, Addr branch_addr, void* &bp_history) override;
+    void btbUpdate(ThreadID tid, Addr branch_addr, void* &bp_history) override;
+    void update(ThreadID tid, Addr branch_addr, bool taken, void *bp_history,
+                bool squashed) override;
+    void squash(ThreadID tid, void *bp_history) override;
+    unsigned getGHR(ThreadID tid, void *bp_history) const override;
+
+  private:
+    // Prediction Structures
+    // Loop Predictor Entry
+    struct LoopEntry
+    {
+        uint16_t numIter;
+        uint16_t currentIter;
+        uint16_t currentIterSpec;
+        uint8_t confidence;
+        uint16_t tag;
+        uint8_t age;
+        bool dir;
+
+        LoopEntry() : numIter(0), currentIter(0), currentIterSpec(0),
+                      confidence(0), tag(0), age(0), dir(0) { }
+    };
+
+    // Bimodal Predictor Entry
+    struct BimodalEntry
+    {
+        uint8_t pred;
+        uint8_t hyst;
+
+        BimodalEntry() : pred(0), hyst(1) { }
+    };
+
+    // Tage Entry
+    struct TageEntry
+    {
+        int8_t ctr;
+        uint16_t tag;
+        int8_t u;
+        TageEntry() : ctr(0), tag(0), u(0) { }
+    };
+
+    // Folded History Table - compressed history
+    // to mix with instruction PC to index partially
+    // tagged tables.
+    struct FoldedHistory
+    {
+        unsigned comp;
+        int compLength;
+        int origLength;
+        int outpoint;
+
+        void init(int original_length, int compressed_length)
+        {
+            comp = 0;
+            origLength = original_length;
+            compLength = compressed_length;
+            outpoint = original_length % compressed_length;
+        }
+
+        void update(uint8_t * h)
+        {
+            comp = (comp << 1) | h[0];
+            comp ^= h[origLength] << outpoint;
+            comp ^= (comp >> compLength);
+            comp &= (ULL(1) << compLength) - 1;
+        }
+    };
+
+    // Primary branch history entry
+    struct BranchInfo
+    {
+        int pathHist;
+        int ptGhist;
+        int hitBank;
+        int hitBankIndex;
+        int altBank;
+        int altBankIndex;
+        int bimodalIndex;
+        int loopTag;
+        uint16_t currentIter;
+
+        bool tagePred;
+        bool altTaken;
+        bool loopPred;
+        bool loopPredValid;
+        int  loopIndex;
+        int loopHit;
+        bool condBranch;
+        bool longestMatchPred;
+        bool pseudoNewAlloc;
+        Addr branchPC;
+
+        // Pointer to dynamically allocated storage
+        // to save table indices and folded histories.
+        // To do one call to new instead of five.
+        int *storage;
+
+        // Pointers to actual saved array within the dynamically
+        // allocated storage.
+        int *tableIndices;
+        int *tableTags;
+        int *ci;
+        int *ct0;
+        int *ct1;
+
+        BranchInfo(int sz)
+            : pathHist(0), ptGhist(0),
+              hitBank(0), hitBankIndex(0),
+              altBank(0), altBankIndex(0),
+              bimodalIndex(0), loopTag(0), currentIter(0),
+              tagePred(false), altTaken(false), loopPred(false),
+              loopPredValid(false), loopIndex(0), loopHit(0),
+              condBranch(false), longestMatchPred(false),
+              pseudoNewAlloc(false), branchPC(0)
+        {
+            storage = new int [sz * 5];
+            tableIndices = storage;
+            tableTags = storage + sz;
+            ci = tableTags + sz;
+            ct0 = ci + sz;
+            ct1 = ct0 + sz;
+        }
+
+        ~BranchInfo()
+        {
+            delete[] storage;
+        }
+    };
+
+    /**
+     * Computes the index used to access the
+     * bimodal table.
+     * @param pc_in The unshifted branch PC.
+     */
+    int bindex(Addr pc_in) const;
+
+    /**
+     * Computes the index used to access the
+     * loop predictor.
+     * @param pc_in The unshifted branch PC.
+     */
+    int lindex(Addr pc_in) const;
+
+    /**
+     * Computes the index used to access a
+     * partially tagged table.
+     * @param tid The thread ID used to select the
+     * global histories to use.
+     * @param pc The unshifted branch PC.
+     * @param bank The partially tagged table to access.
+     */
+    inline int gindex(ThreadID tid, Addr pc, int bank) const;
+
+    /**
+     * Utility function to shuffle the path history
+     * depending on which tagged table we are accessing.
+     * @param phist The path history.
+     * @param size Number of path history bits to use.
+     * @param bank The partially tagged table to access.
+     */
+    int F(int phist, int size, int bank) const;
+
+    /**
+     * Computes the partial tag of a tagged table.
+     * @param tid the thread ID used to select the
+     * global histories to use.
+     * @param pc The unshifted branch PC.
+     * @param bank The partially tagged table to access.
+     */
+    inline uint16_t gtag(ThreadID tid, Addr pc, int bank) const;
+
+    /**
+     * Updates a direction counter based on the actual
+     * branch outcome.
+     * @param ctr Reference to counter to update.
+     * @param taken Actual branch outcome.
+     * @param nbits Counter width.
+     */
+    void ctrUpdate(int8_t & ctr, bool taken, int nbits);
+
+    /**
+     * Get a branch prediction from the bimodal
+     * predictor.
+     * @param pc The unshifted branch PC.
+     * @param bi Pointer to information on the
+     * prediction.
+     */
+    bool getBimodePred(Addr pc, BranchInfo* bi) const;
+
+    /**
+     * Updates the bimodal predictor.
+     * @param pc The unshifted branch PC.
+     * @param taken The actual branch outcome.
+     * @param bi Pointer to information on the prediction
+     * recorded at prediction time.
+     */
+    void baseUpdate(Addr pc, bool taken, BranchInfo* bi);
+
+    /**
+     * Get a branch prediction from the loop
+     * predictor.
+     * @param pc The unshifted branch PC.
+     * @param bi Pointer to information on the
+     * prediction.
+     */
+    bool getLoop(Addr pc, BranchInfo* bi) const;
+
+   /**
+    * Updates the loop predictor.
+    * @param pc The unshifted branch PC.
+    * @param taken The actual branch outcome.
+    * @param bi Pointer to information on the
+    * prediction recorded at prediction time.
+    */
+    void loopUpdate(Addr pc, bool Taken, BranchInfo* bi);
+
+   /**
+    * (Speculatively) updates the global branch history.
+    * @param h Reference to pointer to global branch history.
+    * @param dir (Predicted) outcome to update the histories
+    * with.
+    * @param tab
+    * @param PT Reference to path history.
+    */
+    void updateGHist(uint8_t * &h, bool dir, uint8_t * tab, int &PT);
+
+    /**
+     * Get a branch prediction from L-TAGE. *NOT* an override of
+     * BpredUnit::predict().
+     * @param tid The thread ID to select the global
+     * histories to use.
+     * @param branch_pc The unshifted branch PC.
+     * @param cond_branch True if the branch is conditional.
+     * @param b Reference to wrapping pointer to allow storing
+     * derived class prediction information in the base class.
+     */
+    bool predict(ThreadID tid, Addr branch_pc, bool cond_branch, void* &b);
+
+    /**
+     * Update L-TAGE. Called at execute to repair histories on a misprediction
+     * and at commit to update the tables.
+     * @param tid The thread ID to select the global
+     * histories to use.
+     * @param branch_pc The unshifted branch PC.
+     * @param taken Actual branch outcome.
+     * @param bi Pointer to information on the prediction
+     * recorded at prediction time.
+     */
+    void update(ThreadID tid, Addr branch_pc, bool taken, BranchInfo* bi);
+
+   /**
+    * (Speculatively) updates global histories (path and direction).
+    * Also recomputes compressed (folded) histories based on the
+    * branch direction.
+    * @param tid The thread ID to select the histories
+    * to update.
+    * @param branch_pc The unshifted branch PC.
+    * @param taken (Predicted) branch direction.
+    * @param b Wrapping pointer to BranchInfo (to allow
+    * storing derived class prediction information in the
+    * base class).
+    */
+    void updateHistories(ThreadID tid, Addr branch_pc, bool taken, void* b);
+
+    /**
+     * Restores speculatively updated path and direction histories.
+     * Also recomputes compressed (folded) histories based on the
+     * correct branch outcome.
+     * This version of squash() is called once on a branch misprediction.
+     * @param tid The Thread ID to select the histories to rollback.
+     * @param taken The correct branch outcome.
+     * @param bp_history Wrapping pointer to BranchInfo (to allow
+     * storing derived class prediction information in the
+     * base class).
+     * @post bp_history points to valid memory.
+     */
+    void squash(ThreadID tid, bool taken, void *bp_history);
+
+    /**
+     * Speculatively updates the loop predictor
+     * iteration count.
+     * @param pc The unshifted branch PC.
+     * @param taken The predicted branch outcome.
+     * @param bi Pointer to information on the prediction
+     * recorded at prediction time.
+     */
+    void specLoopUpdate(Addr pc, bool taken, BranchInfo* bi);
+
+    const unsigned logSizeBiMP;
+    const unsigned logSizeTagTables;
+    const unsigned logSizeLoopPred;
+    const unsigned nHistoryTables;
+    const unsigned tagTableCounterBits;
+    const unsigned histBufferSize;
+    const unsigned minHist;
+    const unsigned maxHist;
+    const unsigned minTagWidth;
+
+    BimodalEntry *btable;
+    TageEntry **gtable;
+    LoopEntry *ltable;
+
+    // Keep per-thread histories to
+    // support SMT.
+    struct ThreadHistory {
+        // Speculative path history
+        // (LSB of branch address)
+        int pathHist;
+
+        // Speculative branch direction
+        // history (circular buffer)
+        // @TODO Convert to std::vector<bool>
+        uint8_t *globalHistory;
+
+        // Pointer to most recent branch outcome
+        uint8_t* gHist;
+
+        // Index to most recent branch outcome
+        int ptGhist;
+
+        // Speculative folded histories.
+        FoldedHistory *computeIndices;
+        FoldedHistory *computeTags[2];
+    };
+
+    std::vector<ThreadHistory> threadHistory;
+
+    int tagWidths[15];
+    int tagTableSizes[15];
+    int *histLengths;
+    int *tableIndices;
+    int *tableTags;
+
+    int8_t loopUseCounter;
+    int8_t useAltPredForNewlyAllocated;
+    int tCounter;
+    int logTick;
+};
+
+#endif // __CPU_PRED_LTAGE