Example usage for org.apache.hadoop.util StringUtils stringifyException

Introduction

On this page you can find usage examples for org.apache.hadoop.util.StringUtils#stringifyException.

Prototype

public static String stringifyException(Throwable e) 

Document

Make a string representation of the exception.
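
A minimal sketch of the common pattern, shown before the full examples (the class name and message are illustrative, not taken from the sources below): the exception's stack trace is rendered into a String so it can be embedded in a log message.

import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class StringifyExceptionDemo {

    private static final Logger LOG = LoggerFactory.getLogger(StringifyExceptionDemo.class);

    public static void main(String[] args) {
        try {
            throw new IllegalStateException("something went wrong");
        } catch (Exception e) {
            // stringifyException returns the message plus the full stack trace as a String
            LOG.error("StringifyExceptionDemo: " + StringUtils.stringifyException(e));
        }
    }
}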

Usage

From source file:org.apache.nutch.host.HostInjectorJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: HostInjectorJob <host_dir>");
        return -1;
    }
    try {
        boolean success = inject(new Path(args[0]));
        if (!success) {
            LOG.error("HostInjectorJob: failed ");
            return -1;
        }
        LOG.info("HostInjectorJob: finished");
        return 0;
    } catch (Exception e) {
        LOG.error("HostInjectorJob: " + StringUtils.stringifyException(e));
        return -1;
    }
}
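
These run(String[]) methods come from Hadoop's Tool interface. A minimal sketch of how such a job is typically launched (the launcher class is hypothetical; it assumes HostInjectorJob implements Tool, as Nutch jobs do):

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.host.HostInjectorJob;
import org.apache.nutch.util.NutchConfiguration;

public class LaunchHostInjector {
    public static void main(String[] args) throws Exception {
        // ToolRunner strips generic Hadoop options, then delegates to run(args);
        // the int returned by run() becomes the process exit code
        int res = ToolRunner.run(NutchConfiguration.create(), new HostInjectorJob(), args);
        System.exit(res);
    }
}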

From source file:org.apache.nutch.hostdb.ReadHostDb.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println(
                "Usage: ReadHostDb <hostdb> [-get <url>] [<output> [-dumpHomepages | -dumpHostnames | -expr <expr.>]]");
        return -1;
    }

    boolean dumpHomepages = false;
    boolean dumpHostnames = false;
    String expr = null;
    String get = null;

    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-dumpHomepages")) {
            LOG.info("ReadHostDb: dumping homepage URL's");
            dumpHomepages = true;
        }
        if (args[i].equals("-dumpHostnames")) {
            LOG.info("ReadHostDb: dumping hostnames");
            dumpHostnames = true;
        }
        if (args[i].equals("-get")) {
            get = args[i + 1];
            LOG.info("ReadHostDb: get: " + get);
            i++;
        }
        if (args[i].equals("-expr")) {
            expr = args[i + 1];
            LOG.info("ReadHostDb: evaluating expression: " + expr);
            i++;
        }
    }

    try {
        if (get != null) {
            getHostDbRecord(new Path(args[0], "current"), get);
        } else {
            readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr);
        }
        return 0;
    } catch (Exception e) {
        LOG.error("ReadHostDb: " + StringUtils.stringifyException(e));
        return -1;
    }
}

From source file:org.apache.nutch.hostdb.ResolverThread.java

License:Apache License

/**
 *
 */
public void run() {
    // Resolve the host and act appropriately
    try {
        // Throws an exception if host is not found
        @SuppressWarnings("unused")
        InetAddress inetAddr = InetAddress.getByName(host);

        if (datum.isEmpty()) {
            context.getCounter("UpdateHostDb", "new_known_host").increment(1);
            datum.setLastCheck();
            LOG.info(host + ": new_known_host " + datum);
        } else if (datum.getDnsFailures() > 0) {
            context.getCounter("UpdateHostDb", "rediscovered_host").increment(1);
            datum.setLastCheck();
            datum.setDnsFailures(0l);
            LOG.info(host + ": rediscovered_host " + datum);
        } else {
            context.getCounter("UpdateHostDb", "existing_known_host").increment(1);
            datum.setLastCheck();
            LOG.info(host + ": existing_known_host " + datum);
        }

        // Write the host datum
        context.write(hostText, datum);
    } catch (UnknownHostException e) {
        try {
            // If the datum is empty we'll initialize with date = today and 1 failure
            if (datum.isEmpty()) {
                datum.setLastCheck();
                datum.setDnsFailures(1l);
                context.write(hostText, datum);
                context.getCounter("UpdateHostDb", "new_unknown_host").increment(1);
                LOG.info(host + ": new_unknown_host " + datum);
            } else {
                datum.setLastCheck();
                datum.incDnsFailures();

                // Check if this host should be forgotten
                if (purgeFailedHostsThreshold == -1 || purgeFailedHostsThreshold < datum.getDnsFailures()) {

                    context.write(hostText, datum);
                    context.getCounter("UpdateHostDb", "existing_unknown_host").increment(1);
                    LOG.info(host + ": existing_unknown_host " + datum);
                } else {
                    context.getCounter("UpdateHostDb", "purged_unknown_host").increment(1);
                    LOG.info(host + ": purged_unknown_host " + datum);
                }
            }

            context.getCounter("UpdateHostDb", Long.toString(datum.numFailures()) + "_times_failed")
                    .increment(1);
        } catch (Exception ioe) {
            LOG.warn(StringUtils.stringifyException(ioe));
        }
    } catch (Exception e) {
        LOG.warn(StringUtils.stringifyException(e));
    }

    context.getCounter("UpdateHostDb", "checked_hosts").increment(1);
}

From source file:org.apache.nutch.hostdb.UpdateHostDb.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: UpdateHostDb -hostdb <hostdb> "
                + "[-tophosts <tophosts>] [-crawldb <crawldb>] [-checkAll] [-checkFailed]"
                + " [-checkNew] [-checkKnown] [-force] [-filter] [-normalize]");
        return -1;
    }

    Path hostDb = null;
    Path crawlDb = null;
    Path topHosts = null;

    boolean checkFailed = false;
    boolean checkNew = false;
    boolean checkKnown = false;
    boolean force = false;

    boolean filter = false;
    boolean normalize = false;

    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-hostdb")) {
            hostDb = new Path(args[i + 1]);
            LOG.info("UpdateHostDb: hostdb: " + hostDb);
            i++;
        }
        if (args[i].equals("-crawldb")) {
            crawlDb = new Path(args[i + 1]);
            LOG.info("UpdateHostDb: crawldb: " + crawlDb);
            i++;
        }
        if (args[i].equals("-tophosts")) {
            topHosts = new Path(args[i + 1]);
            LOG.info("UpdateHostDb: tophosts: " + topHosts);
            i++;
        }

        if (args[i].equals("-checkFailed")) {
            LOG.info("UpdateHostDb: checking failed hosts");
            checkFailed = true;
        }
        if (args[i].equals("-checkNew")) {
            LOG.info("UpdateHostDb: checking new hosts");
            checkNew = true;
        }
        if (args[i].equals("-checkKnown")) {
            LOG.info("UpdateHostDb: checking known hosts");
            checkKnown = true;
        }
        if (args[i].equals("-checkAll")) {
            LOG.info("UpdateHostDb: checking all hosts");
            checkFailed = true;
            checkNew = true;
            checkKnown = true;
        }
        if (args[i].equals("-force")) {
            LOG.info("UpdateHostDb: forced check");
            force = true;
        }
        if (args[i].equals("-filter")) {
            LOG.info("UpdateHostDb: filtering enabled");
            filter = true;
        }
        if (args[i].equals("-normalize")) {
            LOG.info("UpdateHostDb: normalizing enabled");
            normalize = true;
        }
    }

    if (hostDb == null) {
        System.err.println("hostDb is mandatory");
        return -1;
    }

    try {
        updateHostDb(hostDb, crawlDb, topHosts, checkFailed, checkNew, checkKnown, force, filter, normalize);

        return 0;
    } catch (Exception e) {
        LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e));
        return -1;
    }
}

From source file:org.apache.nutch.hostdb.UpdateHostDbReducer.java

License:Apache License

/**
  *
  */
@Override
public void reduce(Text key, Iterable<NutchWritable> values, Context context)
        throws IOException, InterruptedException {

    Map<String, Map<String, Long>> stringCounts = new HashMap<>();
    Map<String, Float> maximums = new HashMap<>();
    Map<String, Float> sums = new HashMap<>(); // used to calc averages
    Map<String, Long> counts = new HashMap<>(); // used to calc averages
    Map<String, Float> minimums = new HashMap<>();
    Map<String, TDigest> tdigests = new HashMap<>();

    HostDatum hostDatum = new HostDatum();
    float score = 0;

    if (stringFields != null) {
        for (int i = 0; i < stringFields.length; i++) {
            stringCounts.put(stringFields[i], new HashMap<>());
        }
    }

    // Loop through all values until we find a non-empty HostDatum, or use
    // an empty one if this is a new host for the host db
    for (NutchWritable val : values) {
        final Writable value = val.get(); // unwrap

        // Count crawl datum statuses and collect metadata from fields
        if (value instanceof CrawlDatum) {
            CrawlDatum buffer = (CrawlDatum) value;

            // Set the correct status field
            switch (buffer.getStatus()) {
            case CrawlDatum.STATUS_DB_UNFETCHED:
                hostDatum.setUnfetched(hostDatum.getUnfetched() + 1l);
                break;

            case CrawlDatum.STATUS_DB_FETCHED:
                hostDatum.setFetched(hostDatum.getFetched() + 1l);
                break;

            case CrawlDatum.STATUS_DB_GONE:
                hostDatum.setGone(hostDatum.getGone() + 1l);
                break;

            case CrawlDatum.STATUS_DB_REDIR_TEMP:
                hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1l);
                break;

            case CrawlDatum.STATUS_DB_REDIR_PERM:
                hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1l);
                break;

            case CrawlDatum.STATUS_DB_NOTMODIFIED:
                hostDatum.setNotModified(hostDatum.getNotModified() + 1l);
                break;
            }

            // Record connection failures
            if (buffer.getRetriesSinceFetch() != 0) {
                hostDatum.incConnectionFailures();
            }

            // Only gather metadata statistics for proper fetched pages
            if (buffer.getStatus() == CrawlDatum.STATUS_DB_FETCHED
                    || buffer.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
                // Deal with the string fields
                if (stringFields != null) {
                    for (int i = 0; i < stringFields.length; i++) {
                        // Does this field exist?
                        if (buffer.getMetaData().get(stringFieldWritables[i]) != null) {
                            // Get it!
                            String metadataValue = null;
                            try {
                                metadataValue = buffer.getMetaData().get(stringFieldWritables[i]).toString();
                            } catch (Exception e) {
                                LOG.error("Metadata field " + stringFields[i]
                                        + " is probably not a numeric value");
                            }

                            // Does the value exist?
                            if (stringCounts.get(stringFields[i]).containsKey(metadataValue)) {
                                // Yes, increment it
                                stringCounts.get(stringFields[i]).put(metadataValue,
                                        stringCounts.get(stringFields[i]).get(metadataValue) + 1l);
                            } else {
                                // Create it!
                                stringCounts.get(stringFields[i]).put(metadataValue, 1l);
                            }
                        }
                    }
                }

                // Deal with the numeric fields
                if (numericFields != null) {
                    for (int i = 0; i < numericFields.length; i++) {
                        // Does this field exist?
                        if (buffer.getMetaData().get(numericFieldWritables[i]) != null) {
                            try {
                                // Get it!
                                Float metadataValue = Float.parseFloat(
                                        buffer.getMetaData().get(numericFieldWritables[i]).toString());

                                // Does a digest for this field already exist?
                                if (tdigests.containsKey(numericFields[i])) {
                                    tdigests.get(numericFields[i]).add(metadataValue);
                                } else {
                                    // Create it!
                                    TDigest tdigest = TDigest.createDigest(100);
                                    tdigest.add((double) metadataValue);
                                    tdigests.put(numericFields[i], tdigest);
                                }

                                // Does the minimum value exist?
                                if (minimums.containsKey(numericFields[i])) {
                                    // Write if this is lower than existing value
                                    if (metadataValue < minimums.get(numericFields[i])) {
                                        minimums.put(numericFields[i], metadataValue);
                                    }
                                } else {
                                    // Create it!
                                    minimums.put(numericFields[i], metadataValue);
                                }

                                // Does the maximum value exist?
                                if (maximums.containsKey(numericFields[i])) {
                                    // Write if this is higher than the existing value
                                    if (metadataValue > maximums.get(numericFields[i])) {
                                        maximums.put(numericFields[i], metadataValue);
                                    }
                                } else {
                                    // Create it!
                                    maximums.put(numericFields[i], metadataValue);
                                }

                                // Sum it up!
                                if (sums.containsKey(numericFields[i])) {
                                    // Increment
                                    sums.put(numericFields[i], sums.get(numericFields[i]) + metadataValue);
                                    counts.put(numericFields[i], counts.get(numericFields[i]) + 1l);
                                } else {
                                    // Create it!
                                    sums.put(numericFields[i], metadataValue);
                                    counts.put(numericFields[i], 1l);
                                }
                            } catch (Exception e) {
                                LOG.error(e.getMessage() + " when processing values for " + key.toString());
                            }
                        }
                    }
                }
            }
        }

    // Check for an existing HostDatum and merge its fields
        else if (value instanceof HostDatum) {
            HostDatum buffer = (HostDatum) value;

            // Check homepage URL
            if (buffer.hasHomepageUrl()) {
                hostDatum.setHomepageUrl(buffer.getHomepageUrl());
            }

            // Check lastCheck timestamp
            if (!buffer.isEmpty()) {
                hostDatum.setLastCheck(buffer.getLastCheck());
            }

            // Check and set DNS failures
            if (buffer.getDnsFailures() > 0) {
                hostDatum.setDnsFailures(buffer.getDnsFailures());
            }

            // Check and set connection failures
            if (buffer.getConnectionFailures() > 0) {
                hostDatum.setConnectionFailures(buffer.getConnectionFailures());
            }

            // Check metadata
            if (!buffer.getMetaData().isEmpty()) {
                hostDatum.setMetaData(buffer.getMetaData());
            }

            // Check and set score (score from Web Graph has precedence)
            if (buffer.getScore() > 0) {
                hostDatum.setScore(buffer.getScore());
            }
        }

        // Check for the score
        else if (value instanceof FloatWritable) {
            FloatWritable buffer = (FloatWritable) value;
            score = buffer.get();
        } else {
            LOG.error("Class {} not handled", value.getClass());
        }
    }

    // Check if score was set from Web Graph
    if (score > 0) {
        hostDatum.setScore(score);
    }

    // Set metadata
    for (Map.Entry<String, Map<String, Long>> entry : stringCounts.entrySet()) {
        for (Map.Entry<String, Long> subEntry : entry.getValue().entrySet()) {
            hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()),
                    new LongWritable(subEntry.getValue()));
        }
    }
    for (Map.Entry<String, Float> entry : maximums.entrySet()) {
        hostDatum.getMetaData().put(new Text("max." + entry.getKey()), new FloatWritable(entry.getValue()));
    }
    for (Map.Entry<String, Float> entry : sums.entrySet()) {
        hostDatum.getMetaData().put(new Text("avg." + entry.getKey()),
                new FloatWritable(entry.getValue() / counts.get(entry.getKey())));
    }
    for (Map.Entry<String, TDigest> entry : tdigests.entrySet()) {
        // Emit all percentiles
        for (int i = 0; i < percentiles.length; i++) {
            hostDatum.getMetaData().put(new Text("pct" + Long.toString(percentiles[i]) + "." + entry.getKey()),
                    new FloatWritable((float) entry.getValue().quantile((double) percentiles[i] / 100.0d)));
        }
    }
    for (Map.Entry<String, Float> entry : minimums.entrySet()) {
        hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue()));
    }

    context.getCounter("UpdateHostDb", "total_hosts").increment(1);

    // See if this record is to be checked
    if (shouldCheck(hostDatum)) {
        // Make an entry
        resolverThread = new ResolverThread(key.toString(), hostDatum, context, purgeFailedHostsThreshold);

        // Add the entry to the queue (blocking)
        try {
            queue.put(resolverThread);
        } catch (InterruptedException e) {
            LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e));
        }

        // Do not proceed; the datum will be written by the resolver thread
        return;
    } else {
        context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1);
        LOG.info("UpdateHostDb: " + key.toString() + ": skipped_not_eligible");
    }

    // Write the host datum if it wasn't written by the resolver thread
    context.write(key, hostDatum);
}

From source file:org.apache.nutch.hostdb.UpdateHostDbReducer.java

License:Apache License

/**
  * Shut down all running threads and wait for completion.
  */
@Override
public void cleanup(Context context) {
    LOG.info("UpdateHostDb: feeder finished, waiting for shutdown");

    // If we're here, all keys have been fed and we can issue a shutdown
    executor.shutdown();

    boolean finished = false;

    // Wait until all resolvers have finished
    while (!finished) {
        try {
            // Wait for the executor to shut down completely
            if (!executor.isTerminated()) {
                LOG.info("UpdateHostDb: resolver threads waiting: " + Integer.toString(executor.getPoolSize()));
                Thread.sleep(1000);
            } else {
                // All is well, get out
                finished = true;
            }
        } catch (InterruptedException e) {
            // Huh?
            LOG.warn(StringUtils.stringifyException(e));
        }
    }
}
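
A side note on the shutdown loop above: the polling keeps the remaining pool size visible in the logs. If that reporting is not needed, ExecutorService.awaitTermination gives the same blocking wait without hand-rolled sleeps; a minimal alternative sketch for the method body (the timeout is an assumption, not from the source):

import java.util.concurrent.TimeUnit;

// After executor.shutdown(): block until all resolvers finish or the timeout elapses
try {
    if (!executor.awaitTermination(1, TimeUnit.HOURS)) {
        LOG.warn("UpdateHostDb: resolver threads did not terminate in time");
    }
} catch (InterruptedException e) {
    LOG.warn(StringUtils.stringifyException(e));
}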

From source file:org.apache.nutch.indexer.DeleteDuplicates.java

License:Apache License

public int run(String[] args) throws Exception {

    if (args.length < 1) {
        System.err.println("Usage: DeleteDuplicates <indexes> ...");
        return -1;
    }

    Path[] indexes = new Path[args.length];
    for (int i = 0; i < args.length; i++) {
        indexes[i] = new Path(args[i]);
    }
    try {
        dedup(indexes);
        return 0;
    } catch (Exception e) {
        LOG.fatal("DeleteDuplicates: " + StringUtils.stringifyException(e));
        return -1;
    }
}

From source file:org.apache.nutch.indexer.field.AnchorFields.java

License:Apache License

/**
 * Runs the Extractor job. Gets outlinks to be converted while ignoring empty
 * and null anchors.
 * 
 * @param webGraphDb The WebGraphDb to pull from.
 * @param output The extractor output.
 * 
 * @throws IOException If an error occurs while running the extractor.
 */
private void runExtractor(Path webGraphDb, Path output) throws IOException {

    JobConf extractor = new NutchJob(getConf());
    extractor.setJobName("AnchorFields Extractor");
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb, WebGraph.OUTLINK_DIR));
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb, WebGraph.NODE_DIR));
    FileOutputFormat.setOutputPath(extractor, output);
    extractor.setInputFormat(SequenceFileInputFormat.class);
    extractor.setMapperClass(Extractor.class);
    extractor.setReducerClass(Extractor.class);
    extractor.setMapOutputKeyClass(Text.class);
    extractor.setMapOutputValueClass(ObjectWritable.class);
    extractor.setOutputKeyClass(Text.class);
    extractor.setOutputValueClass(LinkDatum.class);
    extractor.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting extractor job");
    try {
        JobClient.runJob(extractor);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished extractor job.");
}

From source file:org.apache.nutch.indexer.field.AnchorFields.java

License:Apache License

/**
 * Runs the collector job.  Aggregates extracted inlinks, sorts and converts
 * the highest scoring into FieldWritable objects.  Only inlinks for which
 * basic fields exist will be collected to avoid orphan fields.
 *
 * @param basicFields The BasicFields which must be present to collect anchors
 * to avoid orphan fields.
 * @param links The outlinks path.
 * @param output The collector output.
 * 
 * @throws IOException If an error occurs while running the collector.
 */
private void runCollector(Path basicFields, Path links, Path output) throws IOException {

    JobConf collector = new NutchJob(getConf());
    collector.setJobName("AnchorFields Collector");
    FileInputFormat.addInputPath(collector, links);
    FileInputFormat.addInputPath(collector, basicFields);
    FileOutputFormat.setOutputPath(collector, output);
    collector.setInputFormat(SequenceFileInputFormat.class);
    collector.setMapOutputKeyClass(Text.class);
    collector.setMapOutputValueClass(ObjectWritable.class);
    collector.setMapperClass(Collector.class);
    collector.setReducerClass(Collector.class);
    collector.setOutputKeyClass(Text.class);
    collector.setOutputValueClass(FieldWritable.class);
    collector.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
        JobClient.runJob(collector);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished collector job.");
}

From source file:org.apache.nutch.indexer.field.AnchorFields.java

License:Apache License

/**
 * Runs the AnchorFields job.
 */
public int run(String[] args) throws Exception {

    Options options = new Options();
    Option helpOpts = OptionBuilder.withArgName("help").withDescription("show this help message")
            .create("help");
    Option outputOpts = OptionBuilder.withArgName("output").hasArg()
            .withDescription("the output index directory").create("output");
    Option webGraphDbOpts = OptionBuilder.withArgName("webgraphdb").hasArg()
            .withDescription("the webgraphdb to use").create("webgraphdb");
    Option basicFieldOpts = OptionBuilder.withArgName("basicfields").hasArgs()
            .withDescription("the basicfields to use").create("basicfields");
    options.addOption(helpOpts);
    options.addOption(webGraphDbOpts);
    options.addOption(basicFieldOpts);
    options.addOption(outputOpts);

    CommandLineParser parser = new GnuParser();
    try {

        CommandLine line = parser.parse(options, args);
        if (line.hasOption("help") || !line.hasOption("webgraphdb") || !line.hasOption("output")
                || !line.hasOption("basicfields")) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("AnchorFields", options);
            return -1;
        }

        String webGraphDb = line.getOptionValue("webgraphdb");
        String output = line.getOptionValue("output");
        String basicFields = line.getOptionValue("basicfields");

        createFields(new Path(webGraphDb), new Path(basicFields), new Path(output));
        return 0;
    } catch (Exception e) {
        LOG.fatal("AnchorFields: " + StringUtils.stringifyException(e));
        return -2;
    }
}