org.apache.nutch.scoring.webgraph.NodeReader.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.scoring.webgraph.NodeReader.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.scoring.webgraph;

import java.io.IOException;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;

/**
 * Reads and prints to system out information for a single node from the NodeDb
 * in the WebGraph.
 */
public class NodeReader extends Configured {

    private MapFile.Reader[] nodeReaders;

    public NodeReader() {

    }

    public NodeReader(Configuration conf) {
        super(conf);
    }

    /**
     * Prints the content of the Node represented by the url to system out.
     * 
     * @param webGraphDb
     *          The webgraph from which to get the node.
     * @param url
     *          The url of the node.
     * 
     * @throws IOException
     *           If an error occurs while getting the node.
     */
    public void dumpUrl(Path webGraphDb, String url) throws IOException {

        nodeReaders = MapFileOutputFormat.getReaders(new Path(webGraphDb, WebGraph.NODE_DIR), getConf());

        // open the readers, get the node, print out the info, and close the readers
        Text key = new Text(url);
        Node node = new Node();
        MapFileOutputFormat.getEntry(nodeReaders, new HashPartitioner<>(), key, node);
        System.out.println(url + ":");
        System.out.println("  inlink score: " + node.getInlinkScore());
        System.out.println("  outlink score: " + node.getOutlinkScore());
        System.out.println("  num inlinks: " + node.getNumInlinks());
        System.out.println("  num outlinks: " + node.getNumOutlinks());
        FSUtils.closeReaders(nodeReaders);
    }

    /**
     * Runs the NodeReader tool. The command line arguments must contain a
     * webgraphdb path and a url. The url must match the normalized url that is
     * contained in the NodeDb of the WebGraph.
     */
    public static void main(String[] args) throws Exception {

        Options options = new Options();
        OptionBuilder.withArgName("help");
        OptionBuilder.withDescription("show this help message");
        Option helpOpts = OptionBuilder.create("help");
        options.addOption(helpOpts);

        OptionBuilder.withArgName("webgraphdb");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("the webgraphdb to use");
        Option webGraphOpts = OptionBuilder.create("webgraphdb");
        options.addOption(webGraphOpts);

        OptionBuilder.withArgName("url");
        OptionBuilder.hasOptionalArg();
        OptionBuilder.withDescription("the url to dump");
        Option urlOpts = OptionBuilder.create("url");
        options.addOption(urlOpts);

        CommandLineParser parser = new GnuParser();
        try {

            // command line must take a webgraphdb and a url
            CommandLine line = parser.parse(options, args);
            if (line.hasOption("help") || !line.hasOption("webgraphdb") || !line.hasOption("url")) {
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp("WebGraphReader", options);
                return;
            }

            // dump the values to system out and return
            String webGraphDb = line.getOptionValue("webgraphdb");
            String url = line.getOptionValue("url");
            NodeReader reader = new NodeReader(NutchConfiguration.create());
            reader.dumpUrl(new Path(webGraphDb), url);

            return;
        } catch (Exception e) {
            e.printStackTrace();
            return;
        }
    }

}