Added frequent subcircuit miner to subcircuit library
authorClifford Wolf <clifford@clifford.at>
Sat, 2 Mar 2013 12:53:59 +0000 (13:53 +0100)
committerClifford Wolf <clifford@clifford.at>
Sat, 2 Mar 2013 12:53:59 +0000 (13:53 +0100)
libs/subcircuit/.gitignore [new file with mode: 0644]
libs/subcircuit/Makefile
libs/subcircuit/README
libs/subcircuit/scshell.cc
libs/subcircuit/subcircuit.cc
libs/subcircuit/subcircuit.h
libs/subcircuit/test_mine.txt [new file with mode: 0644]

diff --git a/libs/subcircuit/.gitignore b/libs/subcircuit/.gitignore
new file mode 100644 (file)
index 0000000..9f1eb4e
--- /dev/null
@@ -0,0 +1,2 @@
+demo
+scshell
index af745b4b60ff52f4dd5113d67b8adc04872cfb9f..f81085b5bc3da77c2ea820a8ce51508f2109b7b9 100644 (file)
@@ -39,6 +39,7 @@ scshell: scshell.o subcircuit.o
 
 test: scshell
        ./scshell < test_macc22.txt
+       ./scshell < test_mine.txt
        perl test_perm.pl | ./scshell
        splrun test_shorts.spl | ./scshell
        splrun test_large.spl | ./scshell
index 5c8a8a9e7a25798a08a2d407acbb05d978ecc17e..d1bdb1f646413aa588a18b447feee1934f5cc171 100644 (file)
@@ -14,20 +14,12 @@ Introduction
 
 This is a library that implements a modified Ullmann Subgraph Isomorphism
 Algorithm with additional features aimed at working with coarse grain logic
-networks.
+networks. It also contains a simple frequent subcircuit mining algorithm.
 
 A simple command line tool that exposes the features of the library is also
 included.
 
 
-Under-Construction Warning
---------------------------
-
-This work is under constructions. It is likely that they are bugs in the
-library that need fixing. Feel free to contact me at clifford@clifford.at
-if you have found a bug.
-
-
 C++11 Warning
 -------------
 
@@ -97,6 +89,9 @@ Algorithm are provided by the library.
 
  * Support for finding only non-overlapping matches.
 
+ * A simple miner for frequent subcircuts that operates on the same circuit
+   description format.
+
  * The public API of the library is using std::string identifiers for
    nodes, node types and ports. Internally the costly part of the
    algorithm is only using integer values, thus speeding up the
@@ -328,6 +323,32 @@ bool userCheckSolution(result):
        ignored. The default implementation always returns true.
 
 
+Mining for frequent SubCircuits
+-------------------------------
+
+The solver also contains a miner for frequent subcircuits. The following code
+fragment will find all frequent subcircuits with at least minNodes nodes and
+at most maxNodes nodes that occurs at least minMatches times: 
+
+       std::vector<SubCircuit::Solver::MineResult> results;
+       mySolver.mine(results, minNodes, maxNodes, minMatches);
+
+The miner works by finding frequent pairs of nodes and then combining them
+to larger subcircuits. Because of this incremental strategy the miner only
+works as expected on graphs with markAllExtern() set.
+
+The mine() method has an optional fifth parameter that limits the number
+of matches counted in one graph. This can be useful when mining for circuits
+that are found in at least a number of graphs. E.g. the following call
+would find all subcircuits with 5 nodes that are found in at least 7 of
+the registered graphs:
+
+       mySolver.mine(results, 5, 5, 7, 1);
+
+Note that this miner is not very efficient and therefore its use is not
+recommended for large circuits.
+
+
 Debugging
 ---------
 
@@ -420,6 +441,10 @@ The following commands can be used in scshell outside a graph ... endgraph block
                Call Solver::solve(). The <allow_overlap> must be "1" or "true"
                for true and "0" or "false" for false.
 
+       mine <min_nodes> <max_nodes> <min_matches> [<limit_matches_per_graph>]
+
+               Call Solver::mine().
+
        expect <number>
 
                Print all results so far since the last call to expect. Expect
index 70afcfd45aeced2c32123221f587986e22046ad5..c4b37a4defccf32fb69de8d14ca9ef0a07031b0b 100644 (file)
@@ -26,6 +26,7 @@ int main()
        SubCircuit::Solver solver;
        std::map<std::string, std::set<std::string>> initialMappings;
        std::vector<SubCircuit::Solver::Result> results;
+       std::vector<SubCircuit::Solver::MineResult> mineResults;
        std::vector<std::string> cmdBuffer;
        bool lastCommandExpect = false;
 
@@ -162,6 +163,12 @@ int main()
                                continue;
                        }
 
+                       if (cmdBuffer[0] == "mine" && 4 <= cmdBuffer.size() && cmdBuffer.size() <= 5) {
+                               solver.mine(mineResults, atoi(cmdBuffer[1].c_str()), atoi(cmdBuffer[2].c_str()),
+                                               atoi(cmdBuffer[3].c_str()), cmdBuffer.size() == 5 ? atoi(cmdBuffer[4].c_str()) : -1);
+                               continue;
+                       }
+
                        if (cmdBuffer[0] == "clearoverlap" && cmdBuffer.size() == 1) {
                                solver.clearOverlapHistory();
                                continue;
@@ -179,7 +186,7 @@ int main()
 
                        if (cmdBuffer[0] == "expect" && cmdBuffer.size() == 2) {
                                int expected = atoi(cmdBuffer[1].c_str());
-                               printf("\n-- Expected %d, Got %d --\n", expected, int(results.size()));
+                               printf("\n-- Expected %d, Got %d --\n", expected, int(results.size()) + int(mineResults.size()));
                                for (int i = 0; i < int(results.size()); i++) {
                                        printf("\nMatch #%d: (%s in %s)\n", i, results[i].needleGraphId.c_str(), results[i].haystackGraphId.c_str());
                                        for (const auto &it : results[i].mappings) {
@@ -189,9 +196,18 @@ int main()
                                                printf("\n");
                                        }
                                }
+                               for (auto &result : mineResults) {
+                                       printf("\nFrequent SubCircuit with %d nodes and %d matches:\n", int(result.nodes.size()), result.totalMatchesAfterLimits);
+                                       printf("  primary match in %s:", result.graphId.c_str());
+                                       for (auto &node : result.nodes)
+                                               printf(" %s", node.nodeId.c_str());
+                                       printf("\n");
+                                       for (auto &it : result.matchesPerGraph)
+                                               printf("  matches in %s: %d\n", it.first.c_str(), it.second);
+                               }
                                printf("\n");
-                               if (expected != int(results.size())) {
-                                       printf("^^ expected %d, Got %d ^^\n\n", expected, int(results.size()));
+                               if (expected != int(results.size()) + int(mineResults.size())) {
+                                       printf("^^ expected %d, Got %d ^^\n\n", expected, int(results.size()) + int(mineResults.size()));
                                        printf("   +----------------+\n");
                                        printf("   |  \\|/ ____ \\|/  |\n");
                                        printf("   |  \"@'/ ,. \\`@\"  |\n");
@@ -202,6 +218,7 @@ int main()
                                        return 1;
                                }
                                results.clear();
+                               mineResults.clear();
                                lastCommandExpect = true;
                                continue;
                        }
@@ -215,7 +232,7 @@ int main()
                delete graph;
 
        if (!lastCommandExpect) {
-               printf("\n-- Got %d --\n", int(results.size()));
+               printf("\n-- Got %d --\n", int(results.size()) + int(mineResults.size()));
                for (int i = 0; i < int(results.size()); i++) {
                        printf("\nMatch #%d: (%s in %s)\n", i, results[i].needleGraphId.c_str(), results[i].haystackGraphId.c_str());
                        for (const auto &it : results[i].mappings) {
@@ -225,6 +242,15 @@ int main()
                                printf("\n");
                        }
                }
+               for (auto &result : mineResults) {
+                       printf("\nFrequent SubCircuit with %d nodes and %d matches:\n", int(result.nodes.size()), result.totalMatchesAfterLimits);
+                       printf("  primary match in %s:", result.graphId.c_str());
+                       for (auto &node : result.nodes)
+                               printf(" %s", node.nodeId.c_str());
+                       printf("\n");
+                       for (auto &it : result.matchesPerGraph)
+                               printf("  matches in %s: %d\n", it.first.c_str(), it.second);
+               }
        } else
                printf("PASSED.\n");
 
index b49fa97cf5278f7f55531d98100204aecffc0bdc..a55b97ab415aa68beb4462756c9776c66fdd4df5 100644 (file)
@@ -46,6 +46,42 @@ static std::string stringf(const char *fmt, ...)
        return string;
 }
 
+SubCircuit::Graph::Graph(const Graph &other, const std::vector<std::string> &otherNodes)
+{
+       allExtern = other.allExtern;
+
+       std::map<int, int> other2this;
+       for (int i = 0; i < int(otherNodes.size()); i++) {
+               assert(other.nodeMap.count(otherNodes[i]) > 0);
+               other2this[other.nodeMap.at(otherNodes[i])] = i;
+               nodeMap[otherNodes[i]] = i;
+       }
+
+       std::map<int, int> edges2this;
+       for (auto &i1 : other2this)
+       for (auto &i2 : other.nodes[i1.first].ports)
+       for (auto &i3 : i2.bits)
+               if (edges2this.count(i3.edgeIdx) == 0)
+                       edges2this[i3.edgeIdx] = edges2this.size();
+
+       edges.resize(edges2this.size());
+       for (auto &it : edges2this) {
+               for (auto &bit : other.edges[it.first].portBits)
+                       if (other2this.count(bit.nodeIdx) > 0)
+                               edges[it.second].portBits.insert(BitRef(other2this[bit.nodeIdx], bit.portIdx, bit.bitIdx));
+               edges[it.second].constValue = other.edges[it.first].constValue;
+               edges[it.second].isExtern = other.edges[it.first].isExtern;
+       }
+
+       nodes.resize(other2this.size());
+       for (auto &it : other2this) {
+               nodes[it.second] = other.nodes[it.first];
+               for (auto &i2 : nodes[it.second].ports)
+               for (auto &i3 : i2.bits)
+                       i3.edgeIdx = edges2this.at(i3.edgeIdx);
+       }
+}
+
 bool SubCircuit::Graph::BitRef::operator < (const BitRef &other) const
 {
        if (nodeIdx != other.nodeIdx)
@@ -1072,6 +1108,197 @@ class SubCircuit::SolverWorker
                }
        }
 
+       // additional data structes and functions for mining
+
+       struct NodeSet {
+               std::string graphId;
+               std::set<int> nodes;
+               NodeSet(std::string graphId, int node1, int node2) {
+                       this->graphId = graphId;
+                       nodes.insert(node1);
+                       nodes.insert(node2);
+               }
+               NodeSet(std::string graphId, const std::vector<int> &nodes) {
+                       this->graphId = graphId;
+                       for (int node : nodes)
+                               this->nodes.insert(node);
+               }
+               void extend(const NodeSet &other) {
+                       assert(this->graphId == other.graphId);
+                       for (int node : other.nodes)
+                               nodes.insert(node);
+               }
+               int extendCandidate(const NodeSet &other) const {
+                       if (graphId != other.graphId)
+                               return 0;
+                       int newNodes = 0;
+                       bool intersect = false;
+                       for (int node : other.nodes)
+                               if (nodes.count(node) > 0)
+                                       intersect = true;
+                               else
+                                       newNodes++;
+                       return intersect ? newNodes : 0;
+               }
+               bool operator <(const NodeSet &other) const {
+                       if (graphId != other.graphId)
+                               return graphId < other.graphId;
+                       return nodes < other.nodes;
+               }
+       };
+
+       void solveForMining(std::vector<Solver::Result> &results, const GraphData &needle)
+       {
+               bool backupVerbose = verbose;
+               verbose = false;
+
+               for (auto &it : graphData)
+               {
+                       GraphData &haystack = it.second;
+                       assert(haystack.graph.allExtern);
+
+                       std::vector<std::set<int>> enumerationMatrix;
+                       std::map<std::string, std::set<std::string>> initialMappings;
+                       generateEnumerationMatrix(enumerationMatrix, needle, haystack, initialMappings);
+
+                       haystack.usedNodes.resize(haystack.graph.nodes.size());
+                       ullmannRecursion(results, enumerationMatrix, 0, needle, haystack, true, -1);
+               }
+
+               verbose = backupVerbose;
+       }
+
+       int testForMining(std::vector<Solver::MineResult> &results, std::set<NodeSet> &usedSets, std::vector<std::set<NodeSet>> &nextPool, NodeSet &testSet,
+                       const std::string &graphId, const Graph &graph, int minNodes, int minMatches, int limitMatchesPerGraph)
+       {
+               GraphData needle;
+               std::vector<std::string> needle_nodes;
+               for (int nodeIdx : testSet.nodes)
+                       needle_nodes.push_back(graph.nodes[nodeIdx].nodeId);
+               needle.graph = Graph(graph, needle_nodes);
+               diCache.add(needle.graph, needle.adjMatrix, graphId, userSolver);
+
+               std::vector<Solver::Result> ullmannResults;
+               solveForMining(ullmannResults, needle);
+
+               int matches = 0;
+               std::map<std::string, int> matchesPerGraph;
+               std::set<NodeSet> thisNodeSetSet;
+
+               for (auto &it : ullmannResults)
+               {
+                       std::vector<int> resultNodes;
+                       for (auto &i2 : it.mappings)
+                               resultNodes.push_back(graphData[it.haystackGraphId].graph.nodeMap[i2.second.haystackNodeId]);
+                       NodeSet resultSet(it.haystackGraphId, resultNodes);
+
+                       if (usedSets.count(resultSet) > 0) {
+                               assert(thisNodeSetSet.count(resultSet) > 0);
+                               continue;
+                       }
+                       usedSets.insert(resultSet);
+                       thisNodeSetSet.insert(resultSet);
+
+                       matchesPerGraph[it.haystackGraphId]++;
+                       if (limitMatchesPerGraph < 0 || matchesPerGraph[it.haystackGraphId] < limitMatchesPerGraph)
+                               matches++;
+               }
+
+               if (matches < minMatches)
+                       return 0;
+
+               if (minNodes <= int(testSet.nodes.size()))
+               {
+                       Solver::MineResult result;
+                       result.graphId = graphId;
+                       result.totalMatchesAfterLimits = matches;
+                       result.matchesPerGraph = matchesPerGraph;
+                       for (int nodeIdx : testSet.nodes) {
+                               Solver::MineResultNode resultNode;
+                               resultNode.nodeId = graph.nodes[nodeIdx].nodeId;
+                               resultNode.userData = graph.nodes[nodeIdx].userData;
+                               result.nodes.push_back(resultNode);
+                       }
+                       results.push_back(result);
+               }
+
+               nextPool.push_back(thisNodeSetSet);
+               return matches;
+       }
+
+       void findNodePairs(std::vector<Solver::MineResult> &results, std::vector<std::set<NodeSet>> &nodePairs, int minNodes, int minMatches, int limitMatchesPerGraph)
+       {
+               std::set<NodeSet> usedPairs;
+
+               if (verbose)
+                       printf("\nFind frequent node pairs:\n");
+
+               for (auto &graph_it : graphData)
+               for (int node1 = 0; node1 < int(graph_it.second.graph.nodes.size()); node1++)
+               for (auto &adj_it : graph_it.second.adjMatrix.at(node1))
+               {
+                       const std::string &graphId = graph_it.first;
+                       const auto &graph = graph_it.second.graph;
+                       int node2 = adj_it.first;
+                       NodeSet pair(graphId, node1, node2);
+
+                       if (usedPairs.count(pair) > 0)
+                               continue;
+
+                       int matches = testForMining(results, usedPairs, nodePairs, pair, graphId, graph, minNodes, minMatches, limitMatchesPerGraph);
+
+                       if (verbose && matches > 0)
+                               printf("Pair %s[%s,%s] -> %d\n", graphId.c_str(), graph.nodes[node1].nodeId.c_str(),
+                                               graph.nodes[node2].nodeId.c_str(), matches);
+               }
+       }
+
+       void findNextPool(std::vector<Solver::MineResult> &results, std::vector<std::set<NodeSet>> &pool,
+                       int oldSetSize, int increment, int minNodes, int minMatches, int limitMatchesPerGraph)
+       {
+               std::vector<std::set<NodeSet>> nextPool;
+               std::map<std::string, std::vector<const NodeSet*>> poolPerGraph;
+
+               for (auto &i1 : pool)
+               for (auto &i2 : i1)
+                       poolPerGraph[i2.graphId].push_back(&i2);
+
+               if (verbose)
+                       printf("\nFind frequent subcircuits of size %d using increment %d:\n", oldSetSize+increment, increment);
+
+               std::set<NodeSet> usedSets;
+               for (auto &it : poolPerGraph)
+               for (int idx1 = 0; idx1 < int(it.second.size()); idx1++)
+               for (int idx2 = idx1; idx2 < int(it.second.size()); idx2++)
+               {
+                       if (it.second[idx1]->extendCandidate(*it.second[idx2]) != increment)
+                               continue;
+
+                       NodeSet mergedSet = *it.second[idx1];
+                       mergedSet.extend(*it.second[idx2]);
+
+                       if (usedSets.count(mergedSet) > 0)
+                               continue;
+
+                       const std::string &graphId = it.first;
+                       const auto &graph = graphData[it.first].graph;
+
+                       int matches = testForMining(results, usedSets, nextPool, mergedSet, graphId, graph, minNodes, minMatches, limitMatchesPerGraph);
+
+                       if (verbose) {
+                               printf("Set %s[", graphId.c_str());
+                               bool first = true;
+                               for (int nodeIdx : mergedSet.nodes) {
+                                       printf("%s%s", first ? "" : ",", graph.nodes[nodeIdx].nodeId.c_str());
+                                       first = false;
+                               }
+                               printf("] -> %d\n", matches);
+                       }
+               }
+
+               pool.swap(nextPool);
+       }
+
        // interface to the public Solver class
 
 protected:
@@ -1151,6 +1378,25 @@ protected:
                ullmannRecursion(results, enumerationMatrix, 0, needle, haystack, allowOverlap, maxSolutions > 0 ? results.size() + maxSolutions : -1);
        }
 
+       void mine(std::vector<Solver::MineResult> &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph)
+       {
+               int nodeSetSize = 2;
+               std::vector<std::set<NodeSet>> pool;
+               findNodePairs(results, pool, minNodes, minMatches, limitMatchesPerGraph);
+
+               while (nodeSetSize < maxNodes)
+               {
+                       int increment = nodeSetSize - 1;
+                       if (nodeSetSize + increment >= minNodes)
+                               increment = minNodes - nodeSetSize;
+                       if (nodeSetSize >= minNodes)
+                               increment = 1;
+
+                       findNextPool(results, pool, nodeSetSize, increment, minNodes, minMatches, limitMatchesPerGraph);
+                       nodeSetSize += increment;
+               }
+       }
+
        void clearOverlapHistory()
        {
                for (auto &it : graphData)
@@ -1252,6 +1498,11 @@ void SubCircuit::Solver::solve(std::vector<Result> &results, std::string needleG
        worker->solve(results, needleGraphId, haystackGraphId, initialMappings, allowOverlap, maxSolutions);
 }
 
+void SubCircuit::Solver::mine(std::vector<MineResult> &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph)
+{
+       worker->mine(results, minNodes, maxNodes, minMatches, limitMatchesPerGraph);
+}
+
 void SubCircuit::Solver::clearOverlapHistory()
 {
        worker->clearOverlapHistory();
index da536ba09b4fbe89a884c7baeb2f9eaa37f586e7..b9399a99da6e37b396a6dccbf3d3473c9107787b 100644 (file)
@@ -73,6 +73,7 @@ namespace SubCircuit
 
        public:
                Graph() : allExtern(false) { };
+               Graph(const Graph &other, const std::vector<std::string> &otherNodes);
 
                void createNode(std::string nodeId, std::string typeId, void *userData = NULL);
                void createPort(std::string nodeId, std::string portId, int width = 1, int minWidth = -1);
@@ -100,6 +101,17 @@ namespace SubCircuit
                        std::map<std::string, ResultNodeMapping> mappings;
                };
 
+               struct MineResultNode {
+                       std::string nodeId;
+                       void *userData;
+               };
+               struct MineResult {
+                       std::string graphId;
+                       int totalMatchesAfterLimits;
+                       std::map<std::string, int> matchesPerGraph;
+                       std::vector<MineResultNode> nodes;
+               };
+
        private:
                SolverWorker *worker;
        
@@ -131,6 +143,9 @@ namespace SubCircuit
                void solve(std::vector<Result> &results, std::string needleGraphId, std::string haystackGraphId, bool allowOverlap = true, int maxSolutions = -1);
                void solve(std::vector<Result> &results, std::string needleGraphId, std::string haystackGraphId,
                                const std::map<std::string, std::set<std::string>> &initialMapping, bool allowOverlap = true, int maxSolutions = -1);
+
+               void mine(std::vector<MineResult> &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph = -1);
+
                void clearOverlapHistory();
                void clearConfig();
        };
diff --git a/libs/subcircuit/test_mine.txt b/libs/subcircuit/test_mine.txt
new file mode 100644 (file)
index 0000000..e3b9170
--- /dev/null
@@ -0,0 +1,35 @@
+
+# verbose
+
+graph macc22
+       node mul_1 mul A 32 B 32 Y 32
+       node mul_2 mul A 32 B 32 Y 32
+       node add_1 add A 32 B 32 Y 32
+       connect mul_1 Y add_1 A
+       connect mul_2 Y add_1 B
+       allextern
+endgraph
+
+graph macc4x2
+       node mul_1 mul A 32 B 32 Y 32
+       node mul_2 mul A 32 B 32 Y 32
+       node mul_3 mul A 32 B 32 Y 32
+       node mul_4 mul A 32 B 32 Y 32
+       node add_1 add A 32 B 32 Y 32
+       node add_2 add A 32 B 32 Y 32
+       node add_3 add A 32 B 32 Y 32
+       connect mul_1 Y add_1 A
+       connect mul_2 Y add_1 B
+       connect mul_3 Y add_2 A
+       connect mul_4 Y add_2 B
+       connect add_1 Y add_3 A
+       connect add_2 Y add_3 B
+       allextern
+endgraph
+
+swapgroup mul A B
+swapgroup add A B
+
+mine 2 10 2
+expect 5
+