// Java tutorial
/* * This file is part of Gradoop. * * Gradoop is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Gradoop is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Gradoop. If not, see <http://www.gnu.org/licenses/>. */ package org.gradoop.model.impl.algorithms.fsm.gspan; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import org.apache.flink.hadoop.shaded.com.google.common.collect.Maps; import org.gradoop.model.impl.algorithms.fsm.config.FSMConfig; import org.gradoop.model.impl.algorithms.fsm.gspan.comparators.DFSCodeSiblingComparator; import org.gradoop.model.impl.algorithms.fsm.gspan.encoders.tuples.EdgeTriple; import org.gradoop.model.impl.algorithms.fsm.gspan.pojos.AdjacencyList; import org.gradoop.model.impl.algorithms.fsm.gspan.pojos.AdjacencyListEntry; import org.gradoop.model.impl.algorithms.fsm.gspan.pojos.DFSCode; import org.gradoop.model.impl.algorithms.fsm.gspan.pojos.DFSEmbedding; import org.gradoop.model.impl.algorithms.fsm.gspan.pojos.DFSStep; import org.gradoop.model.impl.algorithms.fsm.gspan.pojos.GSpanEdge; import org.gradoop.model.impl.algorithms.fsm.gspan.pojos.GSpanGraph; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; /** * Encapsulation of all logic of the gSpan algorithm for mining frequent * subgraphs in a collection of graphs. 
File is separated into methods for * graph construction, pattern growth, minimal DFS code validation and pattern * matching */ public class GSpan { // GRAPH CONSTRUCTION /** * Creates the gSpan mining representation of a graph transaction from a * given collection of Gradoop edge triples. * * @param triples the graphs edges * @param <T> edge triple type * @param <IDT> id type * @return graph transaction */ public static <T extends EdgeTriple<IDT>, IDT> GSpanGraph createGSpanGraph(Iterable<T> triples) { // replace GradoopIds by Integer Ids List<GSpanEdge> edges = Lists.newArrayList(); List<AdjacencyList> adjacencyLists = Lists.newArrayList(); createAdjacencyListsAndEdges(triples, adjacencyLists, edges); return createGSpanGraph(adjacencyLists, edges); } /** * Creates the gSpan mining representation of a graph transaction from an * existing encoded subgraph. * * @param subgraph encodes subgraph * @return graph transaction */ private static GSpanGraph createGSpanGraph(DFSCode subgraph) { // turn DFS edges into gSpan edges List<DFSStep> steps = subgraph.getSteps(); List<AdjacencyList> adjacencyLists = Lists.newArrayList(); List<GSpanEdge> edges = Lists.newArrayListWithExpectedSize(steps.size()); createAdjacencyListsAndEdges(steps, adjacencyLists, edges); return createGSpanGraph(adjacencyLists, edges); } /** * Creates the gSpan mining representation of a graph transaction from a * give list of adjacency lists and edges. 
* * @param adjacencyLists adjacency lists * @param edges edges * @return graph transaction */ private static GSpanGraph createGSpanGraph(List<AdjacencyList> adjacencyLists, List<GSpanEdge> edges) { // sort by min vertex label Collections.sort(edges); // create adjacency lists and 1-edge subgraphs with their embeddings Map<DFSCode, Collection<DFSEmbedding>> codeEmbeddings = Maps.newHashMap(); Iterator<GSpanEdge> iterator = edges.iterator(); GSpanEdge lastEdge = iterator.next(); Collection<DFSEmbedding> embeddings = createSingleEdgeSubgraphEmbeddings(codeEmbeddings, lastEdge); while (iterator.hasNext()) { GSpanEdge edge = iterator.next(); // add embedding of 1-edge code if (edge.compareTo(lastEdge) == 0) { embeddings.add(createSingleEdgeEmbedding(edge)); } else { embeddings = createSingleEdgeSubgraphEmbeddings(codeEmbeddings, edge); lastEdge = edge; } } return new GSpanGraph(adjacencyLists, codeEmbeddings); } /** * turns edge triples into gSpan edges, * i.e., replaces GradoopIds by local integer ids * * @param iterable edge triples with GradoopIds * @param adjacencyLists adjacency lists * @param edges @return gSpan edges * @param <T> edge triple type * @param <IDT> id type */ private static <T extends EdgeTriple<IDT>, IDT> void createAdjacencyListsAndEdges(final Iterable<T> iterable, final List<AdjacencyList> adjacencyLists, final List<GSpanEdge> edges) { Map<IDT, Integer> vertexIdMap = Maps.newHashMap(); int vertexId = 0; int edgeId = 0; for (EdgeTriple<IDT> triple : iterable) { Integer edgeLabel = triple.getEdgeLabel(); IDT sourceGradoopId = triple.getSourceId(); Integer sourceId = vertexIdMap.get(sourceGradoopId); Integer sourceLabel = triple.getSourceLabel(); if (sourceId == null) { sourceId = vertexId; vertexIdMap.put(sourceGradoopId, sourceId); vertexId++; } IDT targetGradoopId = triple.getTargetId(); Integer targetId = vertexIdMap.get(targetGradoopId); Integer targetLabel = triple.getTargetLabel(); if (targetId == null) { targetId = vertexId; 
vertexIdMap.put(targetGradoopId, targetId); vertexId++; } addNewEdgeAndAdjacencyListEntries(edges, adjacencyLists, sourceId, sourceLabel, edgeId, edgeLabel, targetId, targetLabel); edgeId++; } } /** * Creates a new gSpan edge and two adjacency list entries for given labels * and identifiers. Edge will be added to an edge lists and adjacency list * entries to an adjacency list. * * @param edges edges * @param adjacencyLists adjacency lists * @param sourceId source vertex id * @param sourceLabel source vertex label * @param edgeId edge id * @param edgeLabel edge label * @param targetId target id * @param targetLabel target label */ private static void addNewEdgeAndAdjacencyListEntries(final List<GSpanEdge> edges, final List<AdjacencyList> adjacencyLists, int sourceId, int sourceLabel, int edgeId, int edgeLabel, int targetId, int targetLabel) { edges.add(new GSpanEdge(sourceId, sourceLabel, edgeId, edgeLabel, targetId, targetLabel)); if (sourceId <= targetId) { addAdjacencyListEntry(adjacencyLists, sourceId, sourceLabel, true, edgeId, edgeLabel, targetId, targetLabel); addAdjacencyListEntry(adjacencyLists, targetId, targetLabel, false, edgeId, edgeLabel, sourceId, sourceLabel); } else { addAdjacencyListEntry(adjacencyLists, targetId, targetLabel, false, edgeId, edgeLabel, sourceId, sourceLabel); addAdjacencyListEntry(adjacencyLists, sourceId, sourceLabel, true, edgeId, edgeLabel, targetId, targetLabel); } } /** * Creates a new adjacency list entry and adds it to its adjacency list. 
* * @param adjacencyLists adjacency lists * @param fromId entry owning vertex id * @param fromLabel entry owning vertex label * @param outgoing true, if outgoing entry, false, if incoming * @param edgeId edge id * @param edgeLabel edge label * @param toId referenced vertex id * @param toLabel reference vertex label */ private static void addAdjacencyListEntry(final List<AdjacencyList> adjacencyLists, int fromId, int fromLabel, boolean outgoing, int edgeId, int edgeLabel, int toId, int toLabel) { AdjacencyList adjacencyList; if (fromId >= adjacencyLists.size()) { adjacencyList = new AdjacencyList(fromLabel); adjacencyLists.add(adjacencyList); } else { adjacencyList = adjacencyLists.get(fromId); } adjacencyList.getEntries().add(new AdjacencyListEntry(outgoing, edgeId, edgeLabel, toId, toLabel)); } /** * Creates adjacency lists and gSpan edges for a list of DFS steps. * * @param steps DFS steps * @param adjacencyLists adjacency lists * @param edges edges */ private static void createAdjacencyListsAndEdges(final List<DFSStep> steps, final List<AdjacencyList> adjacencyLists, final List<GSpanEdge> edges) { int edgeId = 0; for (DFSStep step : steps) { Integer edgeLabel = step.getEdgeLabel(); Integer sourceId; Integer sourceLabel; Integer targetId; Integer targetLabel; if (step.isOutgoing()) { sourceId = step.getFromTime(); sourceLabel = step.getFromLabel(); targetId = step.getToTime(); targetLabel = step.getToLabel(); } else { sourceId = step.getToTime(); sourceLabel = step.getToLabel(); targetId = step.getFromTime(); targetLabel = step.getFromLabel(); } addNewEdgeAndAdjacencyListEntries(edges, adjacencyLists, sourceId, sourceLabel, edgeId, edgeLabel, targetId, targetLabel); edgeId++; } } /** * Creates a new entry for single edge subgraph including an initial * embedding for a gibe edge. 
* * @param subgraphEmbeddings subgraph-embeddings map * @param edge edge * @return collection of embeddings */ private static Collection<DFSEmbedding> createSingleEdgeSubgraphEmbeddings( Map<DFSCode, Collection<DFSEmbedding>> subgraphEmbeddings, GSpanEdge edge) { DFSCode subgraph = createSingleEdgeSubgraph(edge); DFSEmbedding embedding = createSingleEdgeEmbedding(edge); Collection<DFSEmbedding> embeddings = Lists.newArrayList(embedding); subgraphEmbeddings.put(subgraph, embeddings); return embeddings; } /** * Creates a single edge embedding for a given edge. * * @param edge edge * @return embedding */ private static DFSEmbedding createSingleEdgeEmbedding(final GSpanEdge edge) { List<Integer> vertexTimes; if (edge.isLoop()) { vertexTimes = Lists.newArrayList(edge.getSourceId()); } else if (edge.getSourceLabel() <= edge.getTargetLabel()) { vertexTimes = Lists.newArrayList(edge.getSourceId(), edge.getTargetId()); } else { vertexTimes = Lists.newArrayList(edge.getTargetId(), edge.getSourceId()); } return new DFSEmbedding(vertexTimes, Lists.newArrayList(edge.getEdgeId())); } /** * Create a single edge Subgraph for a given edge. * * @param edge edge * @return subgraph */ private static DFSCode createSingleEdgeSubgraph(final GSpanEdge edge) { int sourceLabel = edge.getSourceLabel(); int edgeLabel = edge.getLabel(); int targetLabel = edge.getTargetLabel(); DFSStep step; if (edge.isLoop()) { step = new DFSStep(0, sourceLabel, true, edgeLabel, 0, sourceLabel); } else if (edge.sourceIsMinimumLabel()) { step = new DFSStep(0, sourceLabel, true, edgeLabel, 1, targetLabel); } else { step = new DFSStep(0, targetLabel, false, edgeLabel, 1, sourceLabel); } return new DFSCode(step); } // PATTERN GROWTH /** * Core of gSpan pattern growth. * Grows all children of supported frequent subgraphs. 
* * @param graph graph * @param frequentParentSubgraphs frequent subgraphs * @param fsmConfig FSM configuration */ public static void growEmbeddings(final GSpanGraph graph, Collection<DFSCode> frequentParentSubgraphs, FSMConfig fsmConfig) { Map<DFSCode, Collection<DFSEmbedding>> childCodeEmbeddings = null; for (DFSCode parentSubgraph : frequentParentSubgraphs) { Collection<DFSEmbedding> parentEmbeddings = graph.getSubgraphEmbeddings().get(parentSubgraph); if (parentEmbeddings != null) { int minVertexLabel = parentSubgraph.getMinVertexLabel(); List<Integer> rightmostPath = parentSubgraph.getRightMostPathVertexTimes(); // for each embedding for (DFSEmbedding parentEmbedding : parentEmbeddings) { // first iterated vertex is rightmost Boolean rightMostVertex = true; Set<Integer> visitedEdges = Sets.newHashSet(parentEmbedding.getEdgeTimes()); Map<Integer, Integer> vertexIdTime = createVertexTimeIndex(parentEmbedding); // for each time on rightmost path for (Integer fromVertexTime : rightmostPath) { Integer fromVertexId = parentEmbedding.getVertexTimes().get(fromVertexTime); // query fromVertex data AdjacencyList adjacencyList = graph.getAdjacencyLists().get(fromVertexId); Integer fromVertexLabel = adjacencyList.getFromVertexLabel(); // for each incident edge for (AdjacencyListEntry entry : adjacencyList.getEntries()) { // if valid extension for branch int toVertexLabel = entry.getToVertexLabel(); if (toVertexLabel >= minVertexLabel) { Integer edgeId = entry.getEdgeId(); // if edge not already contained if (!visitedEdges.contains(edgeId)) { // query toVertexData Integer toVertexId = entry.getToVertexId(); Integer toVertexTime = vertexIdTime.get(toVertexId); boolean forward = toVertexTime == null; // PRUNING : grow only forward // or backward from rightmost vertex if (forward || rightMostVertex) { DFSEmbedding childEmbedding = DFSEmbedding.deepCopy(parentEmbedding); DFSCode childCode = DFSCode.deepCopy(parentSubgraph); // add new vertex to embedding for forward steps if 
(forward) { childEmbedding.getVertexTimes().add(toVertexId); toVertexTime = vertexIdTime.size(); } childCode.getSteps() .add(new DFSStep(fromVertexTime, fromVertexLabel, entry.isOutgoing(), entry.getEdgeLabel(), toVertexTime, toVertexLabel)); childEmbedding.getEdgeTimes().add(edgeId); childCodeEmbeddings = addCodeEmbedding(childCodeEmbeddings, childCode, childEmbedding); } } } } rightMostVertex = false; } } } } graph.setSubgraphEmbeddings(childCodeEmbeddings); } /** * Creates an index of the vertex time of each mapped vertex id. * * @param embedding embedding containing the time-vertex mapping * @return vertex-time mapping */ private static Map<Integer, Integer> createVertexTimeIndex(DFSEmbedding embedding) { Map<Integer, Integer> vertexTimes = Maps.newHashMapWithExpectedSize(embedding.getVertexTimes().size()); int vertexTime = 0; for (int vertexId : embedding.getVertexTimes()) { vertexTimes.put(vertexId, vertexTime); vertexTime++; } return vertexTimes; } /** * Returns a new comparator for subgraph siblings depending on the given * configuration. * @param fsmConfig FSM configuration * @return comparator */ private static DFSCodeSiblingComparator getSiblingComparator(FSMConfig fsmConfig) { return new DFSCodeSiblingComparator(fsmConfig.isDirected()); } /** * Adds a subgraph and one of its embeddings to a map of subgraphs and * their embeddings. Creates a new map, if null. Creates a new entry, if DFS * code not already contained. 
* * @param subgraphEmbeddings subgraph - embeddings - map * @param subgraph subgraph * @param embedding embedding * @return updated input map */ private static Map<DFSCode, Collection<DFSEmbedding>> addCodeEmbedding( Map<DFSCode, Collection<DFSEmbedding>> subgraphEmbeddings, DFSCode subgraph, DFSEmbedding embedding) { Collection<DFSEmbedding> embeddings; if (subgraphEmbeddings == null) { subgraphEmbeddings = Maps.newHashMap(); subgraphEmbeddings.put(subgraph, Lists.newArrayList(embedding)); } else { embeddings = subgraphEmbeddings.get(subgraph); if (embeddings == null) { subgraphEmbeddings.put(subgraph, Lists.newArrayList(embedding)); } else { embeddings.add(embedding); } } return subgraphEmbeddings; } // MINIMAL DFS CODE VALIDATION /** * Checks, if a subgraph is minimal. * * @param subgraph subgraph * @param fsmConfig FSM configuration * @return true, if minimal, false otherwise */ public static boolean isMinimal(DFSCode subgraph, FSMConfig fsmConfig) { GSpanGraph graph = createGSpanGraph(subgraph); DFSCode minDfsCode = calculateMinDFSCode(graph, fsmConfig); return subgraph.equals(minDfsCode); } /** * Determines the minimal subgraph of a given graph. * * @param graph graph * @param fsmConfig FSM configuration * @return minimal subgraph */ public static DFSCode calculateMinDFSCode(GSpanGraph graph, FSMConfig fsmConfig) { DFSCode minDfsCode = null; while (graph.hasGrownSubgraphs()) { Set<DFSCode> grownSubgraphs = graph.getSubgraphEmbeddings().keySet(); minDfsCode = selectMinDFSCode(grownSubgraphs, fsmConfig); growEmbeddings(graph, Lists.newArrayList(minDfsCode), fsmConfig); } return minDfsCode; } /** * Selectis the minimal subgraph from a given collection. 
* * @param subgraphs collection of subgraphs * @param fsmConfig FSM configuration * @return minimal subgraph */ public static DFSCode selectMinDFSCode(final Collection<DFSCode> subgraphs, final FSMConfig fsmConfig) { Iterator<DFSCode> iterator = subgraphs.iterator(); DFSCode minDfsCode = iterator.next(); for (DFSCode nextDfsCode : subgraphs) { if (getSiblingComparator(fsmConfig).compare(nextDfsCode, minDfsCode) < 0) { minDfsCode = nextDfsCode; } } return minDfsCode; } // PATTERN MATCHING /** * Checks, if a graph contains a subgraph (pattern matching). * @param graph search graph * @param subgraph subgraph * @param fsmConfig FSM configuration * @return true, if subgraph is contained, false, otherwise */ public static boolean contains(GSpanGraph graph, DFSCode subgraph, FSMConfig fsmConfig) { Iterator<DFSStep> iterator = subgraph.getSteps().iterator(); DFSStep step = iterator.next(); Collection<DFSEmbedding> parentEmbeddings = Lists.newArrayList(); int fromVertexId = 0; for (AdjacencyList adjacencyList : graph.getAdjacencyLists()) { if (step.getFromLabel().equals(adjacencyList.getFromVertexLabel())) { for (AdjacencyListEntry entry : adjacencyList.getEntries()) { if (step.isOutgoing() == entry.isOutgoing() && step.getEdgeLabel().equals(entry.getEdgeLabel()) && step.getToLabel().equals(entry.getToVertexLabel())) { List<Integer> vertexTimes = step.isLoop() ? 
Lists.newArrayList(fromVertexId) : Lists.newArrayList(fromVertexId, entry.getToVertexId()); List<Integer> edgesTimes = Lists.newArrayList(entry.getEdgeId()); parentEmbeddings.add(new DFSEmbedding(vertexTimes, edgesTimes)); } } } fromVertexId++; } while (iterator.hasNext() && !parentEmbeddings.isEmpty()) { Collection<DFSEmbedding> childEmbeddings = Lists.newArrayList(); step = iterator.next(); for (DFSEmbedding parentEmbedding : parentEmbeddings) { fromVertexId = parentEmbedding.getVertexTimes().get(step.getFromTime()); for (AdjacencyListEntry entry : graph.getAdjacencyLists().get(fromVertexId).getEntries()) { // edge not contained if (!parentEmbedding.getEdgeTimes().contains(entry.getEdgeId())) { // forward traversal if (step.isForward() && // same to vertex label step.getToLabel().equals(entry.getToVertexLabel()) && // same edge label step.getEdgeLabel().equals(entry.getEdgeLabel()) && // vertex not contained !parentEmbedding.getVertexTimes().contains(entry.getToVertexId())) { DFSEmbedding childEmbedding = DFSEmbedding.deepCopy(parentEmbedding); childEmbedding.getVertexTimes().add(entry.getToVertexId()); childEmbedding.getEdgeTimes().add(entry.getEdgeId()); childEmbeddings.add(childEmbedding); // backward traversal } else if (step.isBackward() && // same edge label step.getEdgeLabel().equals(entry.getEdgeLabel()) && // vertex already mapped to correct time parentEmbedding.getVertexTimes().get(step.getToTime()) .equals(entry.getToVertexId())) { DFSEmbedding childEmbedding = DFSEmbedding.deepCopy(parentEmbedding); childEmbedding.getEdgeTimes().add(entry.getEdgeId()); childEmbeddings.add(childEmbedding); } } } } parentEmbeddings = childEmbeddings; } return !parentEmbeddings.isEmpty(); } }