com.cloudera.sa.giraph.examples.ktrusses.KTrussVertex.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.sa.giraph.examples.ktrusses.KTrussVertex.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.sa.giraph.examples.ktrusses;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.giraph.edge.Edge;
import org.apache.giraph.edge.MutableEdge;
import org.apache.giraph.graph.Vertex;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;

/*
 * This algorithm finds trusses in a graph.
 *
 * Trusses are found using a series of algorithm phases, one phase per superstep, selected by
 * the value read from the Constants.PHASE aggregator:
 *
 *  - KCORE: deactivate the edges of nodes whose active degree is too low to ever be part of a truss;
 *  - DEGREE: find the active degree of each node and send it to the active neighbours;
 *  - QUERY_FOR_CLOSING_EDGES: store the neighbours' degrees on the edges and query for the closing
 *    edges of open triads;
 *  - FIND_TRIANGLES: answer those queries and report every triangle found to its three nodes;
 *  - TRUSSES: compute the support of each edge and deactivate edges without enough support. Every
 *    deactivated edge is counted in the Constants.UPDATES aggregator so that everything after
 *    k-core can be repeated until no edges are deactivated;
 *  - COMPONENTISATION_1 / COMPONENTISATION_2: once the trusses are found, label each truss with a
 *    unique name (the lowest node ID in it) for visualisation;
 *  - OUTPUT: vote to halt.
 *
 * The parameter k is read from the job configuration under Constants.K and defaults to 3.
 */
public class KTrussVertex extends Vertex<LongWritable, VertexState, EdgeState, Message> {

    private static final LongWritable ONE = new LongWritable(1);

    /*
     * This method is used to compare two nodes by degree and name, in order to determine the
     * most efficient way to find triangles. It returns true when node A has the lower degree, or
     * when the degrees are equal and A has the lower ID. The ordering is strict: comparing a node
     * with itself returns false.
     */
    private boolean ordering(int degreeA, long idA, int degreeB, long idB) {
        return degreeA < degreeB || (degreeA == degreeB && idA < idB);
    }

    /*
     * This returns the number of active edges this node has.
     */
    private int getActiveDegree() {
        int degree = 0;
        for (Edge<LongWritable, EdgeState> edge : getEdges()) {
            if (edge.getValue().isActive()) {
                degree++;
            }
        }
        return degree;
    }

    /*
     * This is a utility method to send messages to active nodes, i.e. to the target of every
     * edge that is still active.
     */
    private void sendMessageToAllActiveNodes(Message message) {
        for (Edge<LongWritable, EdgeState> edge : getEdges()) {
            if (edge.getValue().isActive()) {
                sendMessage(edge.getTargetVertexId(), message);
            }
        }
    }

    /*
     * This is a utility method to evaluate one corner of a triangle message and use that to increment
     * the support on the edge leading to that corner. The corner that is this node itself is skipped.
     */
    private void incrementSupport(LongWritable nodeID) {
        if (nodeID.equals(getId())) {
            return;
        }
        EdgeState ev = getEdgeValue(nodeID);
        if (ev == null) {
            // There is no edge from this node to that corner, so there is nothing to count
            // (the degree handling in the query phase allows for a missing edge value as well).
            return;
        }
        ev.setSupport(ev.getSupport() + 1);
        setEdgeValue(nodeID, ev);
    }

    /*
     * Runs the phase that is currently published in the Constants.PHASE aggregator. Exactly one
     * of the phase helpers below is executed per call.
     */
    @Override
    public void compute(Iterable<Message> messages) throws IOException {

        // Retrieve the current phase of processing which is set by the MasterCompute
        PhaseWritable writablePhase = getAggregatedValue(Constants.PHASE);
        Phase phase = writablePhase.get();

        int k = getContext().getConfiguration().getInt(Constants.K, 3);

        if (phase == Phase.KCORE) {
            deactivateLowDegreeEdges(k);
        }

        if (phase == Phase.DEGREE) {
            broadcastActiveDegree();
        }

        if (phase == Phase.QUERY_FOR_CLOSING_EDGES) {
            queryForClosingEdges(messages);
        }

        if (phase == Phase.FIND_TRIANGLES) {
            findTriangles(messages);
        }

        if (phase == Phase.TRUSSES) {
            updateTrusses(messages, k);
        }

        if (phase == Phase.COMPONENTISATION_1) {
            startComponentisation();
        }

        if (phase == Phase.COMPONENTISATION_2) {
            propagateTrussId(messages);
        }

        /*
         * And we're done :)
         */
        if (phase == Phase.OUTPUT) {
            voteToHalt();
        }
    }

    /*
     * KCORE phase: reduces the graph to the k-1 core, which reduces the amount of work to find trusses.
     * If this node has fewer than k-1 active edges, all of its active edges are deactivated and each
     * deactivation is counted in the Constants.UPDATES aggregator.
     */
    private void deactivateLowDegreeEdges(int k) throws IOException {
        // Deactivate edges that can't be part of a truss because their degree is too low
        int degree = getActiveDegree();
        if (degree >= k - 1) {
            return;
        }
        for (Edge<LongWritable, EdgeState> edge : getEdges()) {
            if (edge.getValue().isActive()) {
                EdgeState ev = getEdgeValue(edge.getTargetVertexId());
                ev.setActive(false);
                setEdgeValue(edge.getTargetVertexId(), ev);
                aggregate(Constants.UPDATES, ONE);
            }
        }
    }

    /*
     * DEGREE phase: determines the active degree of this node (the number of nodes it connects to
     * after not considering edges which can't be part of a truss), stores it in a fresh node state
     * and sends it to all active neighbours. The degree is used to make triangle finding more
     * efficient in later phases.
     */
    private void broadcastActiveDegree() throws IOException {
        int degree = getActiveDegree();
        setValue(new VertexState(degree, -1, false));
        sendMessageToAllActiveNodes(new Message(getId(), new IntWritable(degree)));
    }

    /*
     * QUERY_FOR_CLOSING_EDGES phase: starts the process of finding triangles. Triangles could be found
     * by any of the three nodes, so to ensure we only find a triangle once, we pick one of the nodes to
     * do the work. Since we can pick which node does the work, we pick the node with lowest degree
     * (fewest links), which is efficient. If two nodes have the same degree, we pick the node with the
     * lowest name.
     *
     * Triangles are found by asking one of the other nodes whether it connects to a third node. This
     * is done with a double-for-loop over the edges, considering only active edges. Again, we send the
     * query to the node that isn't the highest degree.
     */
    private void queryForClosingEdges(Iterable<Message> messages) throws IOException {

        // First, we need to process the degree information and store it on our edges.
        for (Message message : messages) {
            EdgeState edgeValue = getEdgeValue(message.getSource());
            if (edgeValue == null) {
                edgeValue = new EdgeState();
            }
            edgeValue.setTargetDegree(message.getDegree().get());
            setEdgeValue(message.getSource(), edgeValue);
        }

        // Now we find the possible triangles and send a query message to each.
        int myDegree = getValue().getDegree();
        if (myDegree <= 1) {
            // A node with fewer than two active edges cannot be the corner of an open triad.
            return;
        }
        for (Edge<LongWritable, EdgeState> neighbourA : getEdges()) {
            if (!neighbourA.getValue().isActive()) {
                continue;
            }
            int neighbourADegree = neighbourA.getValue().getTargetDegree();
            // Only the lowest-ordered node of a triad sends the query.
            if (!ordering(myDegree, getId().get(), neighbourADegree, neighbourA.getTargetVertexId().get())) {
                continue;
            }
            for (Edge<LongWritable, EdgeState> neighbourB : getEdges()) {
                if (!neighbourB.getValue().isActive()) {
                    continue;
                }
                int neighbourBDegree = neighbourB.getValue().getTargetDegree();
                // Ask A (the lower-ordered of the two neighbours) whether it also links to B.
                if (ordering(neighbourADegree, neighbourA.getTargetVertexId().get(),
                        neighbourBDegree, neighbourB.getTargetVertexId().get())) {
                    sendMessage(neighbourA.getTargetVertexId(),
                            new Message(getId(), neighbourB.getTargetVertexId()));
                }
            }
        }
    }

    /*
     * FIND_TRIANGLES phase: we process the queries about closing edges and find triangles if they are
     * present. We then send details of the triangle (which is only found by one node) to all nodes in
     * that triangle, including ourselves.
     */
    private void findTriangles(Iterable<Message> messages) throws IOException {

        // Cache my active neighbours' names. This is a read-only scan, so the plain edge view is enough.
        Set<Long> activeNeighbours = new HashSet<Long>();
        for (Edge<LongWritable, EdgeState> edge : getEdges()) {
            if (edge.getValue().isActive()) {
                activeNeighbours.add(edge.getTargetVertexId().get());
            }
        }

        // Find triangles and report them to participating nodes.
        for (Message message : messages) {
            if (activeNeighbours.contains(message.getTriadA().get())
                    && activeNeighbours.contains(message.getTriadB().get())) {
                Message triangle = new Message(getId(), message.getTriadA(), message.getTriadB());
                sendMessage(getId(), triangle);
                sendMessage(message.getTriadA(), triangle);
                sendMessage(message.getTriadB(), triangle);
            }
        }
    }

    /*
     * TRUSSES phase: the node processes the triangle reports and uses that to build a picture of how
     * many triangles use each edge - this is known as the "support". We then use that to determine which
     * edges are part of a truss by having enough support (at least k-2), and deactivate those that don't.
     * Every deactivated edge is counted in the Constants.UPDATES aggregator, so that once this phase is
     * complete the MasterCompute can evaluate whether or not the graph changed. If it did we iterate
     * again, starting by determining the active degree, otherwise we continue.
     */
    private void updateTrusses(Iterable<Message> messages, int k) throws IOException {

        // Clear out support values from previous iterations
        for (Edge<LongWritable, EdgeState> edge : getEdges()) {
            EdgeState ev = getEdgeValue(edge.getTargetVertexId());
            ev.setSupport(0);
            setEdgeValue(edge.getTargetVertexId(), ev);
        }

        // Build a picture of the support of each edge
        for (Message message : messages) {
            incrementSupport(message.getTriangleA());
            incrementSupport(message.getTriangleB());
            incrementSupport(message.getTriangleC());
        }

        /*
         * Go through our edges, marking any that don't have enough support as inactive. We also track how
         * many edges we deactivated, in the same way the k-core phase does.
         */
        for (Edge<LongWritable, EdgeState> edge : getEdges()) {
            if (edge.getValue().isActive() && edge.getValue().getSupport() < k - 2) {
                EdgeState ev = getEdgeValue(edge.getTargetVertexId());
                ev.setActive(false);
                ev.setSupport(0);
                setEdgeValue(edge.getTargetVertexId(), ev);
                aggregate(Constants.UPDATES, ONE);
            }
        }
    }

    /*
     * COMPONENTISATION_1 phase: this is the first componentisation phase. At this point, we know our
     * neighbours' trussID because it is initialised to be the same as their ID, so we use that to
     * determine the lowest local truss ID. Once that is found, we broadcast it in order to let the
     * value propagate. Nodes without any active edge are not in a truss and do nothing.
     */
    private void startComponentisation() throws IOException {
        boolean isInATruss = false;
        long lowestId = getId().get();

        // One pass: a node is in a truss if it has an active edge; track the lowest ID seen.
        for (Edge<LongWritable, EdgeState> edge : getEdges()) {
            if (edge.getValue().isActive()) {
                isInATruss = true;
                lowestId = Math.min(lowestId, edge.getTargetVertexId().get());
            }
        }

        if (isInATruss) {
            // lowestId can never exceed our own ID, so the state is always updated and broadcast here.
            getValue().setTrussID(lowestId);
            getValue().setInATruss(true);
            sendMessageToAllActiveNodes(new Message(new LongWritable(getValue().getTrussID())));
        }
    }

    /*
     * COMPONENTISATION_2 phase: this is the second phase of the componentisation, in which the lowest
     * truss ID received is propagated. If the truss ID is lowered, we count the update in the
     * Constants.UPDATES aggregator and rebroadcast it, and stay in this phase until there are no more
     * updates.
     */
    private void propagateTrussId(Iterable<Message> messages) throws IOException {
        // Determine lowest Truss ID
        long lowestValue = getValue().getTrussID();
        for (Message message : messages) {
            lowestValue = Math.min(lowestValue, message.getTrussID().get());
        }

        if (lowestValue < getValue().getTrussID()) {
            getValue().setTrussID(lowestValue);
            aggregate(Constants.UPDATES, ONE);

            // Broadcast the lowest truss ID since we were updated with it
            sendMessageToAllActiveNodes(new Message(new LongWritable(getValue().getTrussID())));
        }
    }

}