This is a library that implements a modified Ullmann Subgraph Isomorphism
Algorithm with additional features aimed at working with coarse grain logic
-networks.
+networks. It also contains a simple frequent subcircuit mining algorithm.
A simple command line tool that exposes the features of the library is also
included.
-Under-Construction Warning
---------------------------
-
-This work is under constructions. It is likely that they are bugs in the
-library that need fixing. Feel free to contact me at clifford@clifford.at
-if you have found a bug.
-
-
C++11 Warning
-------------
* Support for finding only non-overlapping matches.
+ * A simple miner for frequent subcircuts that operates on the same circuit
+ description format.
+
* The public API of the library is using std::string identifiers for
nodes, node types and ports. Internally the costly part of the
algorithm is only using integer values, thus speeding up the
ignored. The default implementation always returns true.
+Mining for frequent SubCircuits
+-------------------------------
+
+The solver also contains a miner for frequent subcircuits. The following code
+fragment will find all frequent subcircuits with at least minNodes nodes and
+at most maxNodes nodes that occurs at least minMatches times:
+
+ std::vector<SubCircuit::Solver::MineResult> results;
+ mySolver.mine(results, minNodes, maxNodes, minMatches);
+
+The miner works by finding frequent pairs of nodes and then combining them
+to larger subcircuits. Because of this incremental strategy the miner only
+works as expected on graphs with markAllExtern() set.
+
+The mine() method has an optional fifth parameter that limits the number
+of matches counted in one graph. This can be useful when mining for circuits
+that are found in at least a number of graphs. E.g. the following call
+would find all subcircuits with 5 nodes that are found in at least 7 of
+the registered graphs:
+
+ mySolver.mine(results, 5, 5, 7, 1);
+
+Note that this miner is not very efficient and therefore its use is not
+recommended for large circuits.
+
+
Debugging
---------
Call Solver::solve(). The <allow_overlap> must be "1" or "true"
for true and "0" or "false" for false.
+ mine <min_nodes> <max_nodes> <min_matches> [<limit_matches_per_graph>]
+
+ Call Solver::mine().
+
expect <number>
Print all results so far since the last call to expect. Expect
SubCircuit::Solver solver;
std::map<std::string, std::set<std::string>> initialMappings;
std::vector<SubCircuit::Solver::Result> results;
+ std::vector<SubCircuit::Solver::MineResult> mineResults;
std::vector<std::string> cmdBuffer;
bool lastCommandExpect = false;
continue;
}
+ if (cmdBuffer[0] == "mine" && 4 <= cmdBuffer.size() && cmdBuffer.size() <= 5) {
+ solver.mine(mineResults, atoi(cmdBuffer[1].c_str()), atoi(cmdBuffer[2].c_str()),
+ atoi(cmdBuffer[3].c_str()), cmdBuffer.size() == 5 ? atoi(cmdBuffer[4].c_str()) : -1);
+ continue;
+ }
+
if (cmdBuffer[0] == "clearoverlap" && cmdBuffer.size() == 1) {
solver.clearOverlapHistory();
continue;
if (cmdBuffer[0] == "expect" && cmdBuffer.size() == 2) {
int expected = atoi(cmdBuffer[1].c_str());
- printf("\n-- Expected %d, Got %d --\n", expected, int(results.size()));
+ printf("\n-- Expected %d, Got %d --\n", expected, int(results.size()) + int(mineResults.size()));
for (int i = 0; i < int(results.size()); i++) {
printf("\nMatch #%d: (%s in %s)\n", i, results[i].needleGraphId.c_str(), results[i].haystackGraphId.c_str());
for (const auto &it : results[i].mappings) {
printf("\n");
}
}
+ for (auto &result : mineResults) {
+ printf("\nFrequent SubCircuit with %d nodes and %d matches:\n", int(result.nodes.size()), result.totalMatchesAfterLimits);
+ printf(" primary match in %s:", result.graphId.c_str());
+ for (auto &node : result.nodes)
+ printf(" %s", node.nodeId.c_str());
+ printf("\n");
+ for (auto &it : result.matchesPerGraph)
+ printf(" matches in %s: %d\n", it.first.c_str(), it.second);
+ }
printf("\n");
- if (expected != int(results.size())) {
- printf("^^ expected %d, Got %d ^^\n\n", expected, int(results.size()));
+ if (expected != int(results.size()) + int(mineResults.size())) {
+ printf("^^ expected %d, Got %d ^^\n\n", expected, int(results.size()) + int(mineResults.size()));
printf(" +----------------+\n");
printf(" | \\|/ ____ \\|/ |\n");
printf(" | \"@'/ ,. \\`@\" |\n");
return 1;
}
results.clear();
+ mineResults.clear();
lastCommandExpect = true;
continue;
}
delete graph;
if (!lastCommandExpect) {
- printf("\n-- Got %d --\n", int(results.size()));
+ printf("\n-- Got %d --\n", int(results.size()) + int(mineResults.size()));
for (int i = 0; i < int(results.size()); i++) {
printf("\nMatch #%d: (%s in %s)\n", i, results[i].needleGraphId.c_str(), results[i].haystackGraphId.c_str());
for (const auto &it : results[i].mappings) {
printf("\n");
}
}
+ for (auto &result : mineResults) {
+ printf("\nFrequent SubCircuit with %d nodes and %d matches:\n", int(result.nodes.size()), result.totalMatchesAfterLimits);
+ printf(" primary match in %s:", result.graphId.c_str());
+ for (auto &node : result.nodes)
+ printf(" %s", node.nodeId.c_str());
+ printf("\n");
+ for (auto &it : result.matchesPerGraph)
+ printf(" matches in %s: %d\n", it.first.c_str(), it.second);
+ }
} else
printf("PASSED.\n");
return string;
}
+SubCircuit::Graph::Graph(const Graph &other, const std::vector<std::string> &otherNodes)
+{
+ allExtern = other.allExtern;
+
+ std::map<int, int> other2this;
+ for (int i = 0; i < int(otherNodes.size()); i++) {
+ assert(other.nodeMap.count(otherNodes[i]) > 0);
+ other2this[other.nodeMap.at(otherNodes[i])] = i;
+ nodeMap[otherNodes[i]] = i;
+ }
+
+ std::map<int, int> edges2this;
+ for (auto &i1 : other2this)
+ for (auto &i2 : other.nodes[i1.first].ports)
+ for (auto &i3 : i2.bits)
+ if (edges2this.count(i3.edgeIdx) == 0)
+ edges2this[i3.edgeIdx] = edges2this.size();
+
+ edges.resize(edges2this.size());
+ for (auto &it : edges2this) {
+ for (auto &bit : other.edges[it.first].portBits)
+ if (other2this.count(bit.nodeIdx) > 0)
+ edges[it.second].portBits.insert(BitRef(other2this[bit.nodeIdx], bit.portIdx, bit.bitIdx));
+ edges[it.second].constValue = other.edges[it.first].constValue;
+ edges[it.second].isExtern = other.edges[it.first].isExtern;
+ }
+
+ nodes.resize(other2this.size());
+ for (auto &it : other2this) {
+ nodes[it.second] = other.nodes[it.first];
+ for (auto &i2 : nodes[it.second].ports)
+ for (auto &i3 : i2.bits)
+ i3.edgeIdx = edges2this.at(i3.edgeIdx);
+ }
+}
+
bool SubCircuit::Graph::BitRef::operator < (const BitRef &other) const
{
if (nodeIdx != other.nodeIdx)
}
}
+ // additional data structes and functions for mining
+
+ struct NodeSet {
+ std::string graphId;
+ std::set<int> nodes;
+ NodeSet(std::string graphId, int node1, int node2) {
+ this->graphId = graphId;
+ nodes.insert(node1);
+ nodes.insert(node2);
+ }
+ NodeSet(std::string graphId, const std::vector<int> &nodes) {
+ this->graphId = graphId;
+ for (int node : nodes)
+ this->nodes.insert(node);
+ }
+ void extend(const NodeSet &other) {
+ assert(this->graphId == other.graphId);
+ for (int node : other.nodes)
+ nodes.insert(node);
+ }
+ int extendCandidate(const NodeSet &other) const {
+ if (graphId != other.graphId)
+ return 0;
+ int newNodes = 0;
+ bool intersect = false;
+ for (int node : other.nodes)
+ if (nodes.count(node) > 0)
+ intersect = true;
+ else
+ newNodes++;
+ return intersect ? newNodes : 0;
+ }
+ bool operator <(const NodeSet &other) const {
+ if (graphId != other.graphId)
+ return graphId < other.graphId;
+ return nodes < other.nodes;
+ }
+ };
+
+ void solveForMining(std::vector<Solver::Result> &results, const GraphData &needle)
+ {
+ bool backupVerbose = verbose;
+ verbose = false;
+
+ for (auto &it : graphData)
+ {
+ GraphData &haystack = it.second;
+ assert(haystack.graph.allExtern);
+
+ std::vector<std::set<int>> enumerationMatrix;
+ std::map<std::string, std::set<std::string>> initialMappings;
+ generateEnumerationMatrix(enumerationMatrix, needle, haystack, initialMappings);
+
+ haystack.usedNodes.resize(haystack.graph.nodes.size());
+ ullmannRecursion(results, enumerationMatrix, 0, needle, haystack, true, -1);
+ }
+
+ verbose = backupVerbose;
+ }
+
+ int testForMining(std::vector<Solver::MineResult> &results, std::set<NodeSet> &usedSets, std::vector<std::set<NodeSet>> &nextPool, NodeSet &testSet,
+ const std::string &graphId, const Graph &graph, int minNodes, int minMatches, int limitMatchesPerGraph)
+ {
+ GraphData needle;
+ std::vector<std::string> needle_nodes;
+ for (int nodeIdx : testSet.nodes)
+ needle_nodes.push_back(graph.nodes[nodeIdx].nodeId);
+ needle.graph = Graph(graph, needle_nodes);
+ diCache.add(needle.graph, needle.adjMatrix, graphId, userSolver);
+
+ std::vector<Solver::Result> ullmannResults;
+ solveForMining(ullmannResults, needle);
+
+ int matches = 0;
+ std::map<std::string, int> matchesPerGraph;
+ std::set<NodeSet> thisNodeSetSet;
+
+ for (auto &it : ullmannResults)
+ {
+ std::vector<int> resultNodes;
+ for (auto &i2 : it.mappings)
+ resultNodes.push_back(graphData[it.haystackGraphId].graph.nodeMap[i2.second.haystackNodeId]);
+ NodeSet resultSet(it.haystackGraphId, resultNodes);
+
+ if (usedSets.count(resultSet) > 0) {
+ assert(thisNodeSetSet.count(resultSet) > 0);
+ continue;
+ }
+ usedSets.insert(resultSet);
+ thisNodeSetSet.insert(resultSet);
+
+ matchesPerGraph[it.haystackGraphId]++;
+ if (limitMatchesPerGraph < 0 || matchesPerGraph[it.haystackGraphId] < limitMatchesPerGraph)
+ matches++;
+ }
+
+ if (matches < minMatches)
+ return 0;
+
+ if (minNodes <= int(testSet.nodes.size()))
+ {
+ Solver::MineResult result;
+ result.graphId = graphId;
+ result.totalMatchesAfterLimits = matches;
+ result.matchesPerGraph = matchesPerGraph;
+ for (int nodeIdx : testSet.nodes) {
+ Solver::MineResultNode resultNode;
+ resultNode.nodeId = graph.nodes[nodeIdx].nodeId;
+ resultNode.userData = graph.nodes[nodeIdx].userData;
+ result.nodes.push_back(resultNode);
+ }
+ results.push_back(result);
+ }
+
+ nextPool.push_back(thisNodeSetSet);
+ return matches;
+ }
+
+ void findNodePairs(std::vector<Solver::MineResult> &results, std::vector<std::set<NodeSet>> &nodePairs, int minNodes, int minMatches, int limitMatchesPerGraph)
+ {
+ std::set<NodeSet> usedPairs;
+
+ if (verbose)
+ printf("\nFind frequent node pairs:\n");
+
+ for (auto &graph_it : graphData)
+ for (int node1 = 0; node1 < int(graph_it.second.graph.nodes.size()); node1++)
+ for (auto &adj_it : graph_it.second.adjMatrix.at(node1))
+ {
+ const std::string &graphId = graph_it.first;
+ const auto &graph = graph_it.second.graph;
+ int node2 = adj_it.first;
+ NodeSet pair(graphId, node1, node2);
+
+ if (usedPairs.count(pair) > 0)
+ continue;
+
+ int matches = testForMining(results, usedPairs, nodePairs, pair, graphId, graph, minNodes, minMatches, limitMatchesPerGraph);
+
+ if (verbose && matches > 0)
+ printf("Pair %s[%s,%s] -> %d\n", graphId.c_str(), graph.nodes[node1].nodeId.c_str(),
+ graph.nodes[node2].nodeId.c_str(), matches);
+ }
+ }
+
+ void findNextPool(std::vector<Solver::MineResult> &results, std::vector<std::set<NodeSet>> &pool,
+ int oldSetSize, int increment, int minNodes, int minMatches, int limitMatchesPerGraph)
+ {
+ std::vector<std::set<NodeSet>> nextPool;
+ std::map<std::string, std::vector<const NodeSet*>> poolPerGraph;
+
+ for (auto &i1 : pool)
+ for (auto &i2 : i1)
+ poolPerGraph[i2.graphId].push_back(&i2);
+
+ if (verbose)
+ printf("\nFind frequent subcircuits of size %d using increment %d:\n", oldSetSize+increment, increment);
+
+ std::set<NodeSet> usedSets;
+ for (auto &it : poolPerGraph)
+ for (int idx1 = 0; idx1 < int(it.second.size()); idx1++)
+ for (int idx2 = idx1; idx2 < int(it.second.size()); idx2++)
+ {
+ if (it.second[idx1]->extendCandidate(*it.second[idx2]) != increment)
+ continue;
+
+ NodeSet mergedSet = *it.second[idx1];
+ mergedSet.extend(*it.second[idx2]);
+
+ if (usedSets.count(mergedSet) > 0)
+ continue;
+
+ const std::string &graphId = it.first;
+ const auto &graph = graphData[it.first].graph;
+
+ int matches = testForMining(results, usedSets, nextPool, mergedSet, graphId, graph, minNodes, minMatches, limitMatchesPerGraph);
+
+ if (verbose) {
+ printf("Set %s[", graphId.c_str());
+ bool first = true;
+ for (int nodeIdx : mergedSet.nodes) {
+ printf("%s%s", first ? "" : ",", graph.nodes[nodeIdx].nodeId.c_str());
+ first = false;
+ }
+ printf("] -> %d\n", matches);
+ }
+ }
+
+ pool.swap(nextPool);
+ }
+
// interface to the public Solver class
protected:
ullmannRecursion(results, enumerationMatrix, 0, needle, haystack, allowOverlap, maxSolutions > 0 ? results.size() + maxSolutions : -1);
}
+ void mine(std::vector<Solver::MineResult> &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph)
+ {
+ int nodeSetSize = 2;
+ std::vector<std::set<NodeSet>> pool;
+ findNodePairs(results, pool, minNodes, minMatches, limitMatchesPerGraph);
+
+ while (nodeSetSize < maxNodes)
+ {
+ int increment = nodeSetSize - 1;
+ if (nodeSetSize + increment >= minNodes)
+ increment = minNodes - nodeSetSize;
+ if (nodeSetSize >= minNodes)
+ increment = 1;
+
+ findNextPool(results, pool, nodeSetSize, increment, minNodes, minMatches, limitMatchesPerGraph);
+ nodeSetSize += increment;
+ }
+ }
+
void clearOverlapHistory()
{
for (auto &it : graphData)
worker->solve(results, needleGraphId, haystackGraphId, initialMappings, allowOverlap, maxSolutions);
}
+void SubCircuit::Solver::mine(std::vector<MineResult> &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph)
+{
+ worker->mine(results, minNodes, maxNodes, minMatches, limitMatchesPerGraph);
+}
+
void SubCircuit::Solver::clearOverlapHistory()
{
worker->clearOverlapHistory();