From c9d933efb04ec8bb167b8405f56283804b12c0fe Mon Sep 17 00:00:00 2001 From: Arthur Perais Date: Wed, 21 Dec 2016 15:25:13 -0600 Subject: [PATCH] cpu: implement an L-TAGE branch predictor MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This patch implements an L-TAGE predictor, based on André Seznec's code available from CBP-2 (http://hpca23.cse.tamu.edu/taco/camino/cbp2/cbp-src/realistic-seznec.h). Signed-off-by Jason Lowe-Power --- src/cpu/pred/BranchPredictor.py | 16 + src/cpu/pred/SConscript | 1 + src/cpu/pred/ltage.cc | 746 ++++++++++++++++++++++++++++++++ src/cpu/pred/ltage.hh | 408 +++++++++++++++++ 4 files changed, 1171 insertions(+) create mode 100644 src/cpu/pred/ltage.cc create mode 100644 src/cpu/pred/ltage.hh diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index 2d7d0d0e2..1b400c2b7 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -86,3 +86,19 @@ class BiModeBP(BranchPredictor): choicePredictorSize = Param.Unsigned(8192, "Size of choice predictor") choiceCtrBits = Param.Unsigned(2, "Bits of choice counters") +class LTAGE(BranchPredictor): + type = 'LTAGE' + cxx_class = 'LTAGE' + cxx_header = "cpu/pred/ltage.hh" + + logSizeBiMP = Param.Unsigned(14, "Log size of Bimodal predictor in bits") + logSizeTagTables = Param.Unsigned(11, "Log size of tag table in LTAGE") + logSizeLoopPred = Param.Unsigned(8, "Log size of the loop predictor") + nHistoryTables = Param.Unsigned(12, "Number of history tables") + tagTableCounterBits = Param.Unsigned(3, "Number of tag table counter bits") + histBufferSize = Param.Unsigned(2097152, + "A large number to track all branch histories(2MEntries default)") + minHist = Param.Unsigned(4, "Minimum history size of LTAGE") + maxHist = Param.Unsigned(640, "Maximum history size of LTAGE") + minTagWidth = Param.Unsigned(7, "Minimum tag size in tag tables") + diff --git a/src/cpu/pred/SConscript b/src/cpu/pred/SConscript index dca5e8d88..1cdf7bbd7 100644 --- a/src/cpu/pred/SConscript +++ b/src/cpu/pred/SConscript @@ -43,6 +43,7 @@ Source('indirect.cc') Source('ras.cc') Source('tournament.cc') Source ('bi_mode.cc') +Source('ltage.cc') DebugFlag('FreeList') DebugFlag('Branch') DebugFlag('LTage') diff --git a/src/cpu/pred/ltage.cc b/src/cpu/pred/ltage.cc new file mode 100644 index 000000000..84fc9a404 --- /dev/null +++ b/src/cpu/pred/ltage.cc @@ -0,0 +1,746 @@ +/* + * Copyright (c) 2014 The University of Wisconsin + * + * Copyright (c) 2006 INRIA (Institut National de Recherche en + * Informatique et en Automatique / French National Research Institute + * for Computer Science and Applied Mathematics) + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Vignyan Reddy, Dibakar Gope and Arthur Perais, + * from André Seznec's code. + */ + +/* @file + * Implementation of a L-TAGE branch predictor + */ + +#include "cpu/pred/ltage.hh" + +#include "base/intmath.hh" +#include "base/misc.hh" +#include "base/random.hh" +#include "base/trace.hh" +#include "debug/Fetch.hh" +#include "debug/LTage.hh" + +LTAGE::LTAGE(const LTAGEParams *params) + : BPredUnit(params), + logSizeBiMP(params->logSizeBiMP), + logSizeTagTables(params->logSizeTagTables), + logSizeLoopPred(params->logSizeLoopPred), + nHistoryTables(params->nHistoryTables), + tagTableCounterBits(params->tagTableCounterBits), + histBufferSize(params->histBufferSize), + minHist(params->minHist), + maxHist(params->maxHist), + minTagWidth(params->minTagWidth), + threadHistory(params->numThreads) +{ + assert(params->histBufferSize > params->maxHist * 2); + useAltPredForNewlyAllocated = 0; + logTick = 19; + tCounter = ULL(1) << (logTick - 1); + + for (auto& history : threadHistory) { + history.pathHist = 0; + history.globalHistory = new uint8_t[histBufferSize]; + history.gHist = history.globalHistory; + memset(history.gHist, 0, histBufferSize); + history.ptGhist = 0; + } + + histLengths = new int [nHistoryTables+1]; + histLengths[1] = minHist; + histLengths[nHistoryTables] = maxHist; + + for (int i = 2; i <= nHistoryTables; i++) { + histLengths[i] = (int) (((double) minHist * + pow ((double) (maxHist) / (double) minHist, + (double) (i - 1) / (double) ((nHistoryTables- 1)))) + + 0.5); + } + + tagWidths[1] = minTagWidth; + tagWidths[2] = minTagWidth; + tagWidths[3] = minTagWidth + 1; + tagWidths[4] = minTagWidth + 1; + tagWidths[5] = minTagWidth + 2; + tagWidths[6] = minTagWidth + 3; + tagWidths[7] = minTagWidth + 4; + tagWidths[8] = minTagWidth + 5; + tagWidths[9] = minTagWidth + 5; + tagWidths[10] = minTagWidth + 6; + tagWidths[11] = minTagWidth + 7; + tagWidths[12] = minTagWidth + 8; + + for (int i = 1; i <= 2; i++) + tagTableSizes[i] = logSizeTagTables - 1; + for (int i = 3; i <= 6; i++) + tagTableSizes[i] = logSizeTagTables; + for (int i = 7; i <= 10; i++) + tagTableSizes[i] = logSizeTagTables - 1; + for (int i = 11; i <= 12; i++) + tagTableSizes[i] = logSizeTagTables - 2; + + for (auto& history : threadHistory) { + history.computeIndices = new FoldedHistory[nHistoryTables+1]; + history.computeTags[0] = new FoldedHistory[nHistoryTables+1]; + history.computeTags[1] = new FoldedHistory[nHistoryTables+1]; + + for (int i = 1; i <= nHistoryTables; i++) { + history.computeIndices[i].init(histLengths[i], (tagTableSizes[i])); + history.computeTags[0][i].init( + history.computeIndices[i].origLength, tagWidths[i]); + history.computeTags[1][i].init( + history.computeIndices[i].origLength, tagWidths[i] - 1); + DPRINTF(LTage, "HistLength:%d, TTSize:%d, TTTWidth:%d\n", + histLengths[i], tagTableSizes[i], tagWidths[i]); + } + } + + btable = new BimodalEntry[ULL(1) << logSizeBiMP]; + ltable = new LoopEntry[ULL(1) << logSizeLoopPred]; + gtable = new TageEntry*[nHistoryTables + 1]; + for (int i = 1; i <= nHistoryTables; i++) { + gtable[i] = new TageEntry[1<<(tagTableSizes[i])]; + } + + tableIndices = new int [nHistoryTables+1]; + tableTags = new int [nHistoryTables+1]; + + loopUseCounter = 0; +} + +int +LTAGE::bindex(Addr pc_in) const +{ + return ((pc_in) & ((ULL(1) << (logSizeBiMP)) - 1)); +} + +int +LTAGE::lindex(Addr pc_in) const +{ + return (((pc_in) & ((ULL(1) << (logSizeLoopPred - 2)) - 1)) << 2); +} + +int +LTAGE::F(int A, int size, int bank) const +{ + int A1, A2; + + A = A & ((ULL(1) << size) - 1); + A1 = (A & ((ULL(1) << tagTableSizes[bank]) - 1)); + A2 = (A >> tagTableSizes[bank]); + A2 = ((A2 << bank) & ((ULL(1) << tagTableSizes[bank]) - 1)) + + (A2 >> (tagTableSizes[bank] - bank)); + A = A1 ^ A2; + A = ((A << bank) & ((ULL(1) << tagTableSizes[bank]) - 1)) + + (A >> (tagTableSizes[bank] - bank)); + return (A); +} + + +// gindex computes a full hash of pc, ghist and pathHist +int +LTAGE::gindex(ThreadID tid, Addr pc, int bank) const +{ + int index; + int hlen = (histLengths[bank] > 16) ? 16 : histLengths[bank]; + index = + (pc) ^ ((pc) >> ((int) abs(tagTableSizes[bank] - bank) + 1)) ^ + threadHistory[tid].computeIndices[bank].comp ^ + F(threadHistory[tid].pathHist, hlen, bank); + + return (index & ((ULL(1) << (tagTableSizes[bank])) - 1)); +} + + +// Tag computation +uint16_t +LTAGE::gtag(ThreadID tid, Addr pc, int bank) const +{ + int tag = (pc) ^ threadHistory[tid].computeTags[0][bank].comp + ^ (threadHistory[tid].computeTags[1][bank].comp << 1); + + return (tag & ((ULL(1) << tagWidths[bank]) - 1)); +} + + +// Up-down saturating counter +void +LTAGE::ctrUpdate(int8_t & ctr, bool taken, int nbits) +{ + assert(nbits <= sizeof(int8_t) << 3); + if (taken) { + if (ctr < ((1 << (nbits - 1)) - 1)) + ctr++; + } else { + if (ctr > -(1 << (nbits - 1))) + ctr--; + } +} + +// Bimodal prediction +bool +LTAGE::getBimodePred(Addr pc, BranchInfo* bi) const +{ + return (btable[bi->bimodalIndex].pred > 0); +} + + +// Update the bimodal predictor: a hysteresis bit is shared among 4 prediction +// bits +void +LTAGE::baseUpdate(Addr pc, bool taken, BranchInfo* bi) +{ + int inter = (btable[bi->bimodalIndex].pred << 1) + + btable[bi->bimodalIndex ].hyst; + if (taken) { + if (inter < 3) + inter++; + } else if (inter > 0) { + inter--; + } + btable[bi->bimodalIndex].pred = inter >> 1; + btable[bi->bimodalIndex].hyst = (inter & 1); + DPRINTF(LTage, "Updating branch %lx, pred:%d, hyst:%d\n", + pc, btable[bi->bimodalIndex].pred,btable[bi->bimodalIndex].hyst); +} + + +//loop prediction: only used if high confidence +bool +LTAGE::getLoop(Addr pc, BranchInfo* bi) const +{ + bi->loopHit = -1; + bi->loopPredValid = false; + bi->loopIndex = lindex(pc); + bi->loopTag = ((pc) >> (logSizeLoopPred - 2)); + + for (int i = 0; i < 4; i++) { + if (ltable[bi->loopIndex + i].tag == bi->loopTag) { + bi->loopHit = i; + bi->loopPredValid = (ltable[bi->loopIndex + i].confidence >= 3); + bi->currentIter = ltable[bi->loopIndex + i].currentIterSpec; + if (ltable[bi->loopIndex + i].currentIterSpec + 1 == + ltable[bi->loopIndex + i].numIter) { + return !(ltable[bi->loopIndex + i].dir); + }else { + return (ltable[bi->loopIndex + i].dir); + } + } + } + return false; +} + +void +LTAGE::specLoopUpdate(Addr pc, bool taken, BranchInfo* bi) +{ + if (bi->loopHit>=0) { + int index = lindex(pc); + if (taken != ltable[index].dir) { + ltable[index].currentIterSpec = 0; + } else { + ltable[index].currentIterSpec++; + } + } +} + +void +LTAGE::loopUpdate(Addr pc, bool taken, BranchInfo* bi) +{ + int idx = bi->loopIndex + bi->loopHit; + if (bi->loopHit >= 0) { + //already a hit + if (bi->loopPredValid) { + if (taken != bi->loopPred) { + // free the entry + ltable[idx].numIter = 0; + ltable[idx].age = 0; + ltable[idx].confidence = 0; + ltable[idx].currentIter = 0; + return; + } else if (bi->loopPred != bi->tagePred) { + DPRINTF(LTage, "Loop Prediction success:%lx\n",pc); + if (ltable[idx].age < 7) + ltable[idx].age++; + } + } + + ltable[idx].currentIter++; + if (ltable[idx].currentIter > ltable[idx].numIter) { + ltable[idx].confidence = 0; + if (ltable[idx].numIter != 0) { + // free the entry + ltable[idx].numIter = 0; + ltable[idx].age = 0; + ltable[idx].confidence = 0; + } + } + + if (taken != ltable[idx].dir) { + if (ltable[idx].currentIter == ltable[idx].numIter) { + DPRINTF(LTage, "Loop End predicted successfully:%lx\n", pc); + + if (ltable[idx].confidence < 7) { + ltable[idx].confidence++; + } + //just do not predict when the loop count is 1 or 2 + if (ltable[idx].numIter < 3) { + // free the entry + ltable[idx].dir = taken; + ltable[idx].numIter = 0; + ltable[idx].age = 0; + ltable[idx].confidence = 0; + } + } else { + DPRINTF(LTage, "Loop End predicted incorrectly:%lx\n", pc); + if (ltable[idx].numIter == 0) { + // first complete nest; + ltable[idx].confidence = 0; + ltable[idx].numIter = ltable[idx].currentIter; + } else { + //not the same number of iterations as last time: free the + //entry + ltable[idx].numIter = 0; + ltable[idx].age = 0; + ltable[idx].confidence = 0; + } + } + ltable[idx].currentIter = 0; + } + + } else if (taken) { + //try to allocate an entry on taken branch + int nrand = random_mt.random(); + for (int i = 0; i < 4; i++) { + int loop_hit = (nrand + i) & 3; + idx = bi->loopIndex + loop_hit; + if (ltable[idx].age == 0) { + DPRINTF(LTage, "Allocating loop pred entry for branch %lx\n", + pc); + ltable[idx].dir = !taken; + ltable[idx].tag = bi->loopTag; + ltable[idx].numIter = 0; + ltable[idx].age = 7; + ltable[idx].confidence = 0; + ltable[idx].currentIter = 1; + break; + + } + else + ltable[idx].age--; + } + } + +} + +// shifting the global history: we manage the history in a big table in order +// to reduce simulation time +void +LTAGE::updateGHist(uint8_t * &h, bool dir, uint8_t * tab, int &pt) +{ + if (pt == 0) { + DPRINTF(LTage, "Rolling over the histories\n"); + // Copy beginning of globalHistoryBuffer to end, such that + // the last maxHist outcomes are still reachable + // through pt[0 .. maxHist - 1]. + for (int i = 0; i < maxHist; i++) + tab[histBufferSize - maxHist + i] = tab[i]; + pt = histBufferSize - maxHist; + h = &tab[pt]; + } + pt--; + h--; + h[0] = (dir) ? 1 : 0; +} + +// Get GHR for hashing indirect predictor +// Build history backwards from pointer in +// bp_history. +unsigned +LTAGE::getGHR(ThreadID tid, void *bp_history) const +{ + BranchInfo* bi = static_cast(bp_history); + unsigned val = 0; + for (unsigned i = 0; i < 32; i++) { + // Make sure we don't go out of bounds + int gh_offset = bi->ptGhist + i; + assert(&(threadHistory[tid].globalHistory[gh_offset]) < + threadHistory[tid].globalHistory + histBufferSize); + val |= ((threadHistory[tid].globalHistory[gh_offset] & 0x1) << i); + } + + return val; +} + +//prediction +bool +LTAGE::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void* &b) +{ + BranchInfo *bi = new BranchInfo(nHistoryTables+1); + b = (void*)(bi); + Addr pc = branch_pc; + bool pred_taken = true; + bi->loopHit = -1; + + if (cond_branch) { + // TAGE prediction + + // computes the table addresses and the partial tags + for (int i = 1; i <= nHistoryTables; i++) { + tableIndices[i] = gindex(tid, pc, i); + bi->tableIndices[i] = tableIndices[i]; + tableTags[i] = gtag(tid, pc, i); + bi->tableTags[i] = tableTags[i]; + } + + bi->bimodalIndex = bindex(pc); + + bi->hitBank = 0; + bi->altBank = 0; + //Look for the bank with longest matching history + for (int i = nHistoryTables; i > 0; i--) { + if (gtable[i][tableIndices[i]].tag == tableTags[i]) { + bi->hitBank = i; + bi->hitBankIndex = tableIndices[bi->hitBank]; + break; + } + } + //Look for the alternate bank + for (int i = bi->hitBank - 1; i > 0; i--) { + if (gtable[i][tableIndices[i]].tag == tableTags[i]) { + bi->altBank = i; + bi->altBankIndex = tableIndices[bi->altBank]; + break; + } + } + //computes the prediction and the alternate prediction + if (bi->hitBank > 0) { + if (bi->altBank > 0) { + bi->altTaken = + gtable[bi->altBank][tableIndices[bi->altBank]].ctr >= 0; + }else { + bi->altTaken = getBimodePred(pc, bi); + } + + bi->longestMatchPred = + gtable[bi->hitBank][tableIndices[bi->hitBank]].ctr >= 0; + bi->pseudoNewAlloc = + abs(2 * gtable[bi->hitBank][bi->hitBankIndex].ctr + 1) <= 1; + + //if the entry is recognized as a newly allocated entry and + //useAltPredForNewlyAllocated is positive use the alternate + //prediction + if ((useAltPredForNewlyAllocated < 0) + || abs(2 * + gtable[bi->hitBank][tableIndices[bi->hitBank]].ctr + 1) > 1) + bi->tagePred = bi->longestMatchPred; + else + bi->tagePred = bi->altTaken; + } else { + bi->altTaken = getBimodePred(pc, bi); + bi->tagePred = bi->altTaken; + bi->longestMatchPred = bi->altTaken; + } + //end TAGE prediction + + bi->loopPred = getLoop(pc, bi); // loop prediction + + pred_taken = (((loopUseCounter >= 0) && bi->loopPredValid)) ? + (bi->loopPred): (bi->tagePred); + DPRINTF(LTage, "Predict for %lx: taken?:%d, loopTaken?:%d, " + "loopValid?:%d, loopUseCounter:%d, tagePred:%d, altPred:%d\n", + branch_pc, pred_taken, bi->loopPred, bi->loopPredValid, + loopUseCounter, bi->tagePred, bi->altTaken); + } + bi->branchPC = branch_pc; + bi->condBranch = cond_branch; + specLoopUpdate(branch_pc, pred_taken, bi); + return pred_taken; +} + +// PREDICTOR UPDATE +void +LTAGE::update(ThreadID tid, Addr branch_pc, bool taken, void* bp_history, + bool squashed) +{ + assert(bp_history); + + BranchInfo *bi = static_cast(bp_history); + + if (squashed) { + // This restores the global history, then update it + // and recomputes the folded histories. + squash(tid, taken, bp_history); + return; + } + + int nrand = random_mt.random(0,3); + Addr pc = branch_pc; + if (bi->condBranch) { + DPRINTF(LTage, "Updating tables for branch:%lx; taken?:%d\n", + branch_pc, taken); + // first update the loop predictor + loopUpdate(pc, taken, bi); + + if (bi->loopPredValid) { + if (bi->tagePred != bi->loopPred) { + ctrUpdate(loopUseCounter, (bi->loopPred== taken), 7); + } + } + + // TAGE UPDATE + // try to allocate a new entries only if prediction was wrong + bool longest_match_pred = false; + bool alloc = (bi->tagePred != taken) && (bi->hitBank < nHistoryTables); + if (bi->hitBank > 0) { + // Manage the selection between longest matching and alternate + // matching for "pseudo"-newly allocated longest matching entry + longest_match_pred = bi->longestMatchPred; + bool PseudoNewAlloc = bi->pseudoNewAlloc; + // an entry is considered as newly allocated if its prediction + // counter is weak + if (PseudoNewAlloc) { + if (longest_match_pred == taken) { + alloc = false; + } + // if it was delivering the correct prediction, no need to + // allocate new entry even if the overall prediction was false + if (longest_match_pred != bi->altTaken) { + ctrUpdate(useAltPredForNewlyAllocated, + bi->altTaken == taken, 4); + } + } + } + + if (alloc) { + // is there some "unuseful" entry to allocate + int8_t min = 1; + for (int i = nHistoryTables; i > bi->hitBank; i--) { + if (gtable[i][bi->tableIndices[i]].u < min) { + min = gtable[i][bi->tableIndices[i]].u; + } + } + + // we allocate an entry with a longer history + // to avoid ping-pong, we do not choose systematically the next + // entry, but among the 3 next entries + int Y = nrand & + ((ULL(1) << (nHistoryTables - bi->hitBank - 1)) - 1); + int X = bi->hitBank + 1; + if (Y & 1) { + X++; + if (Y & 2) + X++; + } + // No entry available, forces one to be available + if (min > 0) { + gtable[X][bi->tableIndices[X]].u = 0; + } + + + //Allocate only one entry + for (int i = X; i <= nHistoryTables; i++) { + if ((gtable[i][bi->tableIndices[i]].u == 0)) { + gtable[i][bi->tableIndices[i]].tag = bi->tableTags[i]; + gtable[i][bi->tableIndices[i]].ctr = (taken) ? 0 : -1; + gtable[i][bi->tableIndices[i]].u = 0; //? + } + } + } + //periodic reset of u: reset is not complete but bit by bit + tCounter++; + if ((tCounter & ((ULL(1) << logTick) - 1)) == 0) { + // reset least significant bit + // most significant bit becomes least significant bit + for (int i = 1; i <= nHistoryTables; i++) { + for (int j = 0; j < (ULL(1) << tagTableSizes[i]); j++) { + gtable[i][j].u = gtable[i][j].u >> 1; + } + } + } + + if (bi->hitBank > 0) { + DPRINTF(LTage, "Updating tag table entry (%d,%d) for branch %lx\n", + bi->hitBank, bi->hitBankIndex, branch_pc); + ctrUpdate(gtable[bi->hitBank][bi->hitBankIndex].ctr, taken, + tagTableCounterBits); + // if the provider entry is not certified to be useful also update + // the alternate prediction + if (gtable[bi->hitBank][bi->hitBankIndex].u == 0) { + if (bi->altBank > 0) { + ctrUpdate(gtable[bi->altBank][bi->altBankIndex].ctr, taken, + tagTableCounterBits); + DPRINTF(LTage, "Updating tag table entry (%d,%d) for" + " branch %lx\n", bi->hitBank, bi->hitBankIndex, + branch_pc); + } + if (bi->altBank == 0) { + baseUpdate(pc, taken, bi); + } + } + + // update the u counter + if (longest_match_pred != bi->altTaken) { + if (longest_match_pred == taken) { + if (gtable[bi->hitBank][bi->hitBankIndex].u < 1) { + gtable[bi->hitBank][bi->hitBankIndex].u++; + } + } + } + } else { + baseUpdate(pc, taken, bi); + } + + //END PREDICTOR UPDATE + } + if (!squashed) { + delete bi; + } +} + +void +LTAGE::updateHistories(ThreadID tid, Addr branch_pc, bool taken, void* b) +{ + BranchInfo* bi = (BranchInfo*)(b); + ThreadHistory& tHist = threadHistory[tid]; + // UPDATE HISTORIES + bool pathbit = ((branch_pc) & 1); + //on a squash, return pointers to this and recompute indices. + //update user history + updateGHist(tHist.gHist, taken, tHist.globalHistory, tHist.ptGhist); + tHist.pathHist = (tHist.pathHist << 1) + pathbit; + tHist.pathHist = (tHist.pathHist & ((ULL(1) << 16) - 1)); + + bi->ptGhist = tHist.ptGhist; + bi->pathHist = tHist.pathHist; + //prepare next index and tag computations for user branchs + for (int i = 1; i <= nHistoryTables; i++) + { + bi->ci[i] = tHist.computeIndices[i].comp; + bi->ct0[i] = tHist.computeTags[0][i].comp; + bi->ct1[i] = tHist.computeTags[1][i].comp; + tHist.computeIndices[i].update(tHist.gHist); + tHist.computeTags[0][i].update(tHist.gHist); + tHist.computeTags[1][i].update(tHist.gHist); + } + DPRINTF(LTage, "Updating global histories with branch:%lx; taken?:%d, " + "path Hist: %x; pointer:%d\n", branch_pc, taken, tHist.pathHist, + tHist.ptGhist); +} + +void +LTAGE::squash(ThreadID tid, bool taken, void *bp_history) +{ + BranchInfo* bi = (BranchInfo*)(bp_history); + ThreadHistory& tHist = threadHistory[tid]; + DPRINTF(LTage, "Restoring branch info: %lx; taken? %d; PathHistory:%x, " + "pointer:%d\n", bi->branchPC,taken, bi->pathHist, bi->ptGhist); + tHist.pathHist = bi->pathHist; + tHist.ptGhist = bi->ptGhist; + tHist.gHist = &(tHist.globalHistory[tHist.ptGhist]); + tHist.gHist[0] = (taken ? 1 : 0); + for (int i = 1; i <= nHistoryTables; i++) { + tHist.computeIndices[i].comp = bi->ci[i]; + tHist.computeTags[0][i].comp = bi->ct0[i]; + tHist.computeTags[1][i].comp = bi->ct1[i]; + tHist.computeIndices[i].update(tHist.gHist); + tHist.computeTags[0][i].update(tHist.gHist); + tHist.computeTags[1][i].update(tHist.gHist); + } + + if (bi->condBranch) { + if (bi->loopHit >= 0) { + int idx = bi->loopIndex + bi->loopHit; + ltable[idx].currentIterSpec = bi->currentIter; + } + } + +} + +void +LTAGE::squash(ThreadID tid, void *bp_history) +{ + BranchInfo* bi = (BranchInfo*)(bp_history); + DPRINTF(LTage, "Deleting branch info: %lx\n", bi->branchPC); + if (bi->condBranch) { + if (bi->loopHit >= 0) { + int idx = bi->loopIndex + bi->loopHit; + ltable[idx].currentIterSpec = bi->currentIter; + } + } + + delete bi; +} + +bool +LTAGE::lookup(ThreadID tid, Addr branch_pc, void* &bp_history) +{ + bool retval = predict(tid, branch_pc, true, bp_history); + + DPRINTF(LTage, "Lookup branch: %lx; predict:%d\n", branch_pc, retval); + updateHistories(tid, branch_pc, retval, bp_history); + assert(threadHistory[tid].gHist == + &threadHistory[tid].globalHistory[threadHistory[tid].ptGhist]); + + return retval; +} + +void +LTAGE::btbUpdate(ThreadID tid, Addr branch_pc, void* &bp_history) +{ + BranchInfo* bi = (BranchInfo*) bp_history; + ThreadHistory& tHist = threadHistory[tid]; + DPRINTF(LTage, "BTB miss resets prediction: %lx\n", branch_pc); + assert(tHist.gHist == &tHist.globalHistory[tHist.ptGhist]); + tHist.gHist[0] = 0; + for (int i = 1; i <= nHistoryTables; i++) { + tHist.computeIndices[i].comp = bi->ci[i]; + tHist.computeTags[0][i].comp = bi->ct0[i]; + tHist.computeTags[1][i].comp = bi->ct1[i]; + tHist.computeIndices[i].update(tHist.gHist); + tHist.computeTags[0][i].update(tHist.gHist); + tHist.computeTags[1][i].update(tHist.gHist); + } +} + +void +LTAGE::uncondBranch(ThreadID tid, Addr br_pc, void* &bp_history) +{ + DPRINTF(LTage, "UnConditionalBranch: %lx\n", br_pc); + predict(tid, br_pc, false, bp_history); + updateHistories(tid, br_pc, true, bp_history); + assert(threadHistory[tid].gHist == + &threadHistory[tid].globalHistory[threadHistory[tid].ptGhist]); +} + +LTAGE* +LTAGEParams::create() +{ + return new LTAGE(this); +} diff --git a/src/cpu/pred/ltage.hh b/src/cpu/pred/ltage.hh new file mode 100644 index 000000000..60c34679e --- /dev/null +++ b/src/cpu/pred/ltage.hh @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2014 The University of Wisconsin + * + * Copyright (c) 2006 INRIA (Institut National de Recherche en + * Informatique et en Automatique / French National Research Institute + * for Computer Science and Applied Mathematics) + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Vignyan Reddy, Dibakar Gope and Arthur Perais, + * from André Seznec's code. + */ + +/* @file + * Implementation of a L-TAGE branch predictor. TAGE is a global-history based + * branch predictor. It features a PC-indexed bimodal predictor and N + * partially tagged tables, indexed with a hash of the PC and the global + * branch history. The different lengths of global branch history used to + * index the partially tagged tables grow geometrically. A small path history + * is also used in the hash. L-TAGE also features a loop predictor that records + * iteration count of loops and predicts accordingly. + * + * All TAGE tables are accessed in parallel, and the one using the longest + * history that matches provides the prediction (some exceptions apply). + * Entries are allocated in components using a longer history than the + * one that predicted when the prediction is incorrect. + */ + +#ifndef __CPU_PRED_LTAGE +#define __CPU_PRED_LTAGE + +#include + +#include "base/types.hh" +#include "cpu/pred/bpred_unit.hh" +#include "params/LTAGE.hh" + +class LTAGE: public BPredUnit +{ + public: + LTAGE(const LTAGEParams *params); + + // Base class methods. + void uncondBranch(ThreadID tid, Addr br_pc, void* &bp_history) override; + bool lookup(ThreadID tid, Addr branch_addr, void* &bp_history) override; + void btbUpdate(ThreadID tid, Addr branch_addr, void* &bp_history) override; + void update(ThreadID tid, Addr branch_addr, bool taken, void *bp_history, + bool squashed) override; + void squash(ThreadID tid, void *bp_history) override; + unsigned getGHR(ThreadID tid, void *bp_history) const override; + + private: + // Prediction Structures + // Loop Predictor Entry + struct LoopEntry + { + uint16_t numIter; + uint16_t currentIter; + uint16_t currentIterSpec; + uint8_t confidence; + uint16_t tag; + uint8_t age; + bool dir; + + LoopEntry() : numIter(0), currentIter(0), currentIterSpec(0), + confidence(0), tag(0), age(0), dir(0) { } + }; + + // Bimodal Predictor Entry + struct BimodalEntry + { + uint8_t pred; + uint8_t hyst; + + BimodalEntry() : pred(0), hyst(1) { } + }; + + // Tage Entry + struct TageEntry + { + int8_t ctr; + uint16_t tag; + int8_t u; + TageEntry() : ctr(0), tag(0), u(0) { } + }; + + // Folded History Table - compressed history + // to mix with instruction PC to index partially + // tagged tables. + struct FoldedHistory + { + unsigned comp; + int compLength; + int origLength; + int outpoint; + + void init(int original_length, int compressed_length) + { + comp = 0; + origLength = original_length; + compLength = compressed_length; + outpoint = original_length % compressed_length; + } + + void update(uint8_t * h) + { + comp = (comp << 1) | h[0]; + comp ^= h[origLength] << outpoint; + comp ^= (comp >> compLength); + comp &= (ULL(1) << compLength) - 1; + } + }; + + // Primary branch history entry + struct BranchInfo + { + int pathHist; + int ptGhist; + int hitBank; + int hitBankIndex; + int altBank; + int altBankIndex; + int bimodalIndex; + int loopTag; + uint16_t currentIter; + + bool tagePred; + bool altTaken; + bool loopPred; + bool loopPredValid; + int loopIndex; + int loopHit; + bool condBranch; + bool longestMatchPred; + bool pseudoNewAlloc; + Addr branchPC; + + // Pointer to dynamically allocated storage + // to save table indices and folded histories. + // To do one call to new instead of five. + int *storage; + + // Pointers to actual saved array within the dynamically + // allocated storage. + int *tableIndices; + int *tableTags; + int *ci; + int *ct0; + int *ct1; + + BranchInfo(int sz) + : pathHist(0), ptGhist(0), + hitBank(0), hitBankIndex(0), + altBank(0), altBankIndex(0), + bimodalIndex(0), loopTag(0), currentIter(0), + tagePred(false), altTaken(false), loopPred(false), + loopPredValid(false), loopIndex(0), loopHit(0), + condBranch(false), longestMatchPred(false), + pseudoNewAlloc(false), branchPC(0) + { + storage = new int [sz * 5]; + tableIndices = storage; + tableTags = storage + sz; + ci = tableTags + sz; + ct0 = ci + sz; + ct1 = ct0 + sz; + } + + ~BranchInfo() + { + delete[] storage; + } + }; + + /** + * Computes the index used to access the + * bimodal table. + * @param pc_in The unshifted branch PC. + */ + int bindex(Addr pc_in) const; + + /** + * Computes the index used to access the + * loop predictor. + * @param pc_in The unshifted branch PC. + */ + int lindex(Addr pc_in) const; + + /** + * Computes the index used to access a + * partially tagged table. + * @param tid The thread ID used to select the + * global histories to use. + * @param pc The unshifted branch PC. + * @param bank The partially tagged table to access. + */ + inline int gindex(ThreadID tid, Addr pc, int bank) const; + + /** + * Utility function to shuffle the path history + * depending on which tagged table we are accessing. + * @param phist The path history. + * @param size Number of path history bits to use. + * @param bank The partially tagged table to access. + */ + int F(int phist, int size, int bank) const; + + /** + * Computes the partial tag of a tagged table. + * @param tid the thread ID used to select the + * global histories to use. + * @param pc The unshifted branch PC. + * @param bank The partially tagged table to access. + */ + inline uint16_t gtag(ThreadID tid, Addr pc, int bank) const; + + /** + * Updates a direction counter based on the actual + * branch outcome. + * @param ctr Reference to counter to update. + * @param taken Actual branch outcome. + * @param nbits Counter width. + */ + void ctrUpdate(int8_t & ctr, bool taken, int nbits); + + /** + * Get a branch prediction from the bimodal + * predictor. + * @param pc The unshifted branch PC. + * @param bi Pointer to information on the + * prediction. + */ + bool getBimodePred(Addr pc, BranchInfo* bi) const; + + /** + * Updates the bimodal predictor. + * @param pc The unshifted branch PC. + * @param taken The actual branch outcome. + * @param bi Pointer to information on the prediction + * recorded at prediction time. + */ + void baseUpdate(Addr pc, bool taken, BranchInfo* bi); + + /** + * Get a branch prediction from the loop + * predictor. + * @param pc The unshifted branch PC. + * @param bi Pointer to information on the + * prediction. + */ + bool getLoop(Addr pc, BranchInfo* bi) const; + + /** + * Updates the loop predictor. + * @param pc The unshifted branch PC. + * @param taken The actual branch outcome. + * @param bi Pointer to information on the + * prediction recorded at prediction time. + */ + void loopUpdate(Addr pc, bool Taken, BranchInfo* bi); + + /** + * (Speculatively) updates the global branch history. + * @param h Reference to pointer to global branch history. + * @param dir (Predicted) outcome to update the histories + * with. + * @param tab + * @param PT Reference to path history. + */ + void updateGHist(uint8_t * &h, bool dir, uint8_t * tab, int &PT); + + /** + * Get a branch prediction from L-TAGE. *NOT* an override of + * BpredUnit::predict(). + * @param tid The thread ID to select the global + * histories to use. + * @param branch_pc The unshifted branch PC. + * @param cond_branch True if the branch is conditional. + * @param b Reference to wrapping pointer to allow storing + * derived class prediction information in the base class. + */ + bool predict(ThreadID tid, Addr branch_pc, bool cond_branch, void* &b); + + /** + * Update L-TAGE. Called at execute to repair histories on a misprediction + * and at commit to update the tables. + * @param tid The thread ID to select the global + * histories to use. + * @param branch_pc The unshifted branch PC. + * @param taken Actual branch outcome. + * @param bi Pointer to information on the prediction + * recorded at prediction time. + */ + void update(ThreadID tid, Addr branch_pc, bool taken, BranchInfo* bi); + + /** + * (Speculatively) updates global histories (path and direction). + * Also recomputes compressed (folded) histories based on the + * branch direction. + * @param tid The thread ID to select the histories + * to update. + * @param branch_pc The unshifted branch PC. + * @param taken (Predicted) branch direction. + * @param b Wrapping pointer to BranchInfo (to allow + * storing derived class prediction information in the + * base class). + */ + void updateHistories(ThreadID tid, Addr branch_pc, bool taken, void* b); + + /** + * Restores speculatively updated path and direction histories. + * Also recomputes compressed (folded) histories based on the + * correct branch outcome. + * This version of squash() is called once on a branch misprediction. + * @param tid The Thread ID to select the histories to rollback. + * @param taken The correct branch outcome. + * @param bp_history Wrapping pointer to BranchInfo (to allow + * storing derived class prediction information in the + * base class). + * @post bp_history points to valid memory. + */ + void squash(ThreadID tid, bool taken, void *bp_history); + + /** + * Speculatively updates the loop predictor + * iteration count. + * @param pc The unshifted branch PC. + * @param taken The predicted branch outcome. + * @param bi Pointer to information on the prediction + * recorded at prediction time. + */ + void specLoopUpdate(Addr pc, bool taken, BranchInfo* bi); + + const unsigned logSizeBiMP; + const unsigned logSizeTagTables; + const unsigned logSizeLoopPred; + const unsigned nHistoryTables; + const unsigned tagTableCounterBits; + const unsigned histBufferSize; + const unsigned minHist; + const unsigned maxHist; + const unsigned minTagWidth; + + BimodalEntry *btable; + TageEntry **gtable; + LoopEntry *ltable; + + // Keep per-thread histories to + // support SMT. + struct ThreadHistory { + // Speculative path history + // (LSB of branch address) + int pathHist; + + // Speculative branch direction + // history (circular buffer) + // @TODO Convert to std::vector + uint8_t *globalHistory; + + // Pointer to most recent branch outcome + uint8_t* gHist; + + // Index to most recent branch outcome + int ptGhist; + + // Speculative folded histories. + FoldedHistory *computeIndices; + FoldedHistory *computeTags[2]; + }; + + std::vector threadHistory; + + int tagWidths[15]; + int tagTableSizes[15]; + int *histLengths; + int *tableIndices; + int *tableTags; + + int8_t loopUseCounter; + int8_t useAltPredForNewlyAllocated; + int tCounter; + int logTick; +}; + +#endif // __CPU_PRED_LTAGE -- 2.30.2