Example usage for org.apache.hadoop.util StringUtils stringifyException

Introduction

This page collects example usages of org.apache.hadoop.util.StringUtils.stringifyException.

Prototype

public static String stringifyException(Throwable e)

Source Link

Document

Make a string representation of the exception.

Usage

From source file:org.apache.nutch.host.HostInjectorJob.java

License:Apache License

/**
 * Command-line entry point: passes the path in {@code args[0]} to
 * {@code inject} and maps the outcome to an exit code.
 *
 * @param args command-line arguments; {@code args[0]} is the host directory
 * @return 0 when {@code inject} reports success, -1 on missing arguments,
 *         on a reported failure, or when an exception is caught
 * @throws Exception part of the signature; exceptions raised inside the
 *         try block are logged and turned into -1 instead of propagating
 */
@Override
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        System.err.println("Usage: HostInjectorJob <host_dir>");
        return -1;
    }

    try {
        final Path hostDir = new Path(args[0]);
        if (inject(hostDir)) {
            LOG.info("HostInjectorJob: finished");
            return 0;
        }
        LOG.error("HostInjectorJob: failed ");
        return -1;
    } catch (Exception e) {
        // Log the full stack trace as a string and signal failure to the caller
        LOG.error("HostInjectorJob: " + StringUtils.stringifyException(e));
        return -1;
    }
}

From source file:org.apache.nutch.hostdb.ReadHostDb.java

License:Apache License

/**
 * Command-line entry point for ReadHostDb.
 *
 * <p>{@code args[0]} is always used as the hostdb path. When {@code -get <url>}
 * is present the single record is looked up via {@code getHostDbRecord};
 * otherwise {@code args[1]} is used as the output path for {@code readHostDb}.
 *
 * <p>Options that take a value ({@code -get}, {@code -expr}) consume the next
 * argument; a missing value is reported as a usage error instead of raising
 * an {@link ArrayIndexOutOfBoundsException}, and a consumed value is never
 * re-interpreted as a flag.
 *
 * @param args command-line arguments
 * @return 0 on success, -1 on a usage error or when an exception is caught
 * @throws Exception part of the signature; exceptions raised by the read
 *         operations are logged and turned into -1
 */
public int run(String[] args) throws Exception {
    final String usage =
            "Usage: ReadHostDb <hostdb> [-get <url>] [<output> [-dumpHomepages | -dumpHostnames | -expr <expr.>]]";

    if (args.length < 2) {
        System.err.println(usage);
        return -1;
    }

    boolean dumpHomepages = false;
    boolean dumpHostnames = false;
    String expr = null;
    String get = null;

    for (int i = 0; i < args.length; i++) {
        final String arg = args[i];
        switch (arg) {
        case "-dumpHomepages":
            LOG.info("ReadHostDb: dumping homepage URL's");
            dumpHomepages = true;
            break;
        case "-dumpHostnames":
            LOG.info("ReadHostDb: dumping hostnames");
            dumpHostnames = true;
            break;
        case "-get":
        case "-expr":
            // Both options need a value in the following argument
            if (i + 1 >= args.length) {
                System.err.println("ReadHostDb: missing value for " + arg);
                System.err.println(usage);
                return -1;
            }
            i++; // consume the value so it is not parsed as an option
            if (arg.equals("-get")) {
                get = args[i];
                LOG.info("ReadHostDb: get: " + get);
            } else {
                expr = args[i];
                LOG.info("ReadHostDb: evaluating expression: " + expr);
            }
            break;
        default:
            // Positional arguments (hostdb, output) are read by index below
            break;
        }
    }

    try {
        if (get != null) {
            getHostDbRecord(new Path(args[0], "current"), get);
        } else {
            readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr);
        }
        return 0;
    } catch (Exception e) {
        LOG.error("ReadHostDb: " + StringUtils.stringifyException(e));
        return -1;
    }
}

From source file:org.apache.nutch.hostdb.ResolverThread.java

License:Apache License

/**
 * Resolves {@code host} via DNS and updates {@code datum} and the job
 * counters according to the outcome.
 *
 * <p>On a successful lookup the datum's last-check time is refreshed, DNS
 * failures are reset if there were any, and the datum is written. On an
 * {@link UnknownHostException} the failure count is initialised or
 * incremented; the datum is written unless purging is enabled
 * ({@code purgeFailedHostsThreshold != -1}) and the host now has more DNS
 * failures than the threshold, in which case it is dropped and counted as
 * purged. Any other exception is logged as a warning. The
 * {@code checked_hosts} counter is incremented in every case.
 */
public void run() {
    // Resolve the host and act appropriately
    try {
        // Throws an exception if host is not found; the address itself is not used
        @SuppressWarnings("unused")
        InetAddress inetAddr = InetAddress.getByName(host);

        if (datum.isEmpty()) {
            context.getCounter("UpdateHostDb", "new_known_host").increment(1);
            datum.setLastCheck();
            LOG.info(host + ": new_known_host " + datum);
        } else if (datum.getDnsFailures() > 0) {
            context.getCounter("UpdateHostDb", "rediscovered_host").increment(1);
            datum.setLastCheck();
            datum.setDnsFailures(0L);
            LOG.info(host + ": rediscovered_host " + datum);
        } else {
            context.getCounter("UpdateHostDb", "existing_known_host").increment(1);
            datum.setLastCheck();
            LOG.info(host + ": existing_known_host " + datum);
        }

        // Write the host datum
        context.write(hostText, datum);
    } catch (UnknownHostException e) {
        try {
            // If the datum is empty we'll initialize with date = today and 1 failure
            if (datum.isEmpty()) {
                datum.setLastCheck();
                datum.setDnsFailures(1L);
                context.write(hostText, datum);
                context.getCounter("UpdateHostDb", "new_unknown_host").increment(1);
                LOG.info(host + ": new_unknown_host " + datum);
            } else {
                datum.setLastCheck();
                datum.incDnsFailures();

                // Keep the host while purging is disabled (-1) or its failure count
                // has not yet exceeded the threshold; otherwise forget it.
                // (The original comparison was inverted and kept only hosts that
                // had already exceeded the threshold.)
                boolean purgingDisabled = purgeFailedHostsThreshold == -1;
                if (purgingDisabled || datum.getDnsFailures() <= purgeFailedHostsThreshold) {
                    context.write(hostText, datum);
                    context.getCounter("UpdateHostDb", "existing_unknown_host").increment(1);
                    LOG.info(host + ": existing_unknown_host " + datum);
                } else {
                    context.getCounter("UpdateHostDb", "purged_unknown_host").increment(1);
                    LOG.info(host + ": purged_unknown_host " + datum);
                }
            }

            // Histogram-style counter keyed by the datum's failure count
            context.getCounter("UpdateHostDb", Long.toString(datum.numFailures()) + "_times_failed")
                    .increment(1);
        } catch (Exception ioe) {
            LOG.warn(StringUtils.stringifyException(ioe));
        }
    } catch (Exception e) {
        LOG.warn(StringUtils.stringifyException(e));
    }

    context.getCounter("UpdateHostDb", "checked_hosts").increment(1);
}

From source file:org.apache.nutch.hostdb.UpdateHostDb.java

License:Apache License

/**
 * Command-line entry point for UpdateHostDb: parses the options and calls
 * {@code updateHostDb}.
 *
 * <p>{@code -hostdb}, {@code -crawldb} and {@code -tophosts} each consume the
 * following argument as a path. A missing value is reported as a usage error
 * instead of raising an {@link ArrayIndexOutOfBoundsException}, and a
 * consumed value is never re-interpreted as a flag. {@code -checkAll} enables
 * the failed, new and known checks together. Unrecognised arguments are
 * ignored.
 *
 * @param args command-line arguments
 * @return 0 on success, -1 on a usage error, when {@code -hostdb} is absent,
 *         or when an exception is caught
 * @throws Exception part of the signature; exceptions raised by
 *         {@code updateHostDb} are logged and turned into -1
 */
public int run(String[] args) throws Exception {
    final String usage = "Usage: UpdateHostDb -hostdb <hostdb> "
            + "[-tophosts <tophosts>] [-crawldb <crawldb>] [-checkAll] [-checkFailed]"
            + " [-checkNew] [-checkKnown] [-force] [-filter] [-normalize]";

    if (args.length < 2) {
        System.err.println(usage);
        return -1;
    }

    Path hostDb = null;
    Path crawlDb = null;
    Path topHosts = null;

    boolean checkFailed = false;
    boolean checkNew = false;
    boolean checkKnown = false;
    boolean force = false;

    boolean filter = false;
    boolean normalize = false;

    for (int i = 0; i < args.length; i++) {
        final String arg = args[i];
        switch (arg) {
        case "-hostdb":
        case "-crawldb":
        case "-tophosts":
            // These options need a path in the following argument
            if (i + 1 >= args.length) {
                System.err.println("UpdateHostDb: missing value for " + arg);
                System.err.println(usage);
                return -1;
            }
            i++; // consume the value so it is not parsed as an option
            if (arg.equals("-hostdb")) {
                hostDb = new Path(args[i]);
                LOG.info("UpdateHostDb: hostdb: " + hostDb);
            } else if (arg.equals("-crawldb")) {
                crawlDb = new Path(args[i]);
                LOG.info("UpdateHostDb: crawldb: " + crawlDb);
            } else {
                topHosts = new Path(args[i]);
                LOG.info("UpdateHostDb: tophosts: " + topHosts);
            }
            break;
        case "-checkFailed":
            LOG.info("UpdateHostDb: checking failed hosts");
            checkFailed = true;
            break;
        case "-checkNew":
            LOG.info("UpdateHostDb: checking new hosts");
            checkNew = true;
            break;
        case "-checkKnown":
            LOG.info("UpdateHostDb: checking known hosts");
            checkKnown = true;
            break;
        case "-checkAll":
            LOG.info("UpdateHostDb: checking all hosts");
            checkFailed = true;
            checkNew = true;
            checkKnown = true;
            break;
        case "-force":
            LOG.info("UpdateHostDb: forced check");
            force = true;
            break;
        case "-filter":
            LOG.info("UpdateHostDb: filtering enabled");
            filter = true;
            break;
        case "-normalize":
            LOG.info("UpdateHostDb: normalizing enabled");
            normalize = true;
            break;
        default:
            // Unknown arguments are ignored, as before
            break;
        }
    }

    if (hostDb == null) {
        System.err.println("hostDb is mandatory");
        return -1;
    }

    try {
        updateHostDb(hostDb, crawlDb, topHosts, checkFailed, checkNew, checkKnown, force, filter, normalize);

        return 0;
    } catch (Exception e) {
        LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e));
        return -1;
    }
}

From source file:org.apache.nutch.hostdb.UpdateHostDbReducer.java

License:Apache License

/**
  *//  www  .  j  a va  2s  . com
  */
@Override
public void reduce(Text key, Iterable<NutchWritable> values, Context context)
        throws IOException, InterruptedException {

    Map<String, Map<String, Long>> stringCounts = new HashMap<>();
    Map<String, Float> maximums = new HashMap<>();
    Map<String, Float> sums = new HashMap<>(); // used to calc averages
    Map<String, Long> counts = new HashMap<>(); // used to calc averages
    Map<String, Float> minimums = new HashMap<>();
    Map<String, TDigest> tdigests = new HashMap<String, TDigest>();

    HostDatum hostDatum = new HostDatum();
    float score = 0;

    if (stringFields != null) {
        for (int i = 0; i < stringFields.length; i++) {
            stringCounts.put(stringFields[i], new HashMap<>());
        }
    }

    // Loop through all values until we find a non-empty HostDatum or use
    // an empty if this is a new host for the host db
    for (NutchWritable val : values) {
        final Writable value = val.get(); // unwrap

        // Count crawl datum status's and collect metadata from fields
        if (value instanceof CrawlDatum) {
            CrawlDatum buffer = (CrawlDatum) value;

            // Set the correct status field
            switch (buffer.getStatus()) {
            case CrawlDatum.STATUS_DB_UNFETCHED:
                hostDatum.setUnfetched(hostDatum.getUnfetched() + 1l);
                break;

            case CrawlDatum.STATUS_DB_FETCHED:
                hostDatum.setFetched(hostDatum.getFetched() + 1l);
                break;

            case CrawlDatum.STATUS_DB_GONE:
                hostDatum.setGone(hostDatum.getGone() + 1l);
                break;

            case CrawlDatum.STATUS_DB_REDIR_TEMP:
                hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1l);
                break;

            case CrawlDatum.STATUS_DB_REDIR_PERM:
                hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1l);
                break;

            case CrawlDatum.STATUS_DB_NOTMODIFIED:
                hostDatum.setNotModified(hostDatum.getNotModified() + 1l);
                break;
            }

            // Record connection failures
            if (buffer.getRetriesSinceFetch() != 0) {
                hostDatum.incConnectionFailures();
            }

            // Only gather metadata statistics for proper fetched pages
            if (buffer.getStatus() == CrawlDatum.STATUS_DB_FETCHED
                    || buffer.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
                // Deal with the string fields
                if (stringFields != null) {
                    for (int i = 0; i < stringFields.length; i++) {
                        // Does this field exist?
                        if (buffer.getMetaData().get(stringFieldWritables[i]) != null) {
                            // Get it!
                            String metadataValue = null;
                            try {
                                metadataValue = buffer.getMetaData().get(stringFieldWritables[i]).toString();
                            } catch (Exception e) {
                                LOG.error("Metadata field " + stringFields[i]
                                        + " is probably not a numeric value");
                            }

                            // Does the value exist?
                            if (stringCounts.get(stringFields[i]).containsKey(metadataValue)) {
                                // Yes, increment it
                                stringCounts.get(stringFields[i]).put(metadataValue,
                                        stringCounts.get(stringFields[i]).get(metadataValue) + 1l);
                            } else {
                                // Create it!
                                stringCounts.get(stringFields[i]).put(metadataValue, 1l);
                            }
                        }
                    }
                }

                // Deal with the numeric fields
                if (numericFields != null) {
                    for (int i = 0; i < numericFields.length; i++) {
                        // Does this field exist?
                        if (buffer.getMetaData().get(numericFieldWritables[i]) != null) {
                            try {
                                // Get it!
                                Float metadataValue = Float.parseFloat(
                                        buffer.getMetaData().get(numericFieldWritables[i]).toString());

                                // Does the median value exist?
                                if (tdigests.containsKey(numericFields[i])) {
                                    tdigests.get(numericFields[i]).add(metadataValue);
                                } else {
                                    // Create it!
                                    TDigest tdigest = TDigest.createDigest(100);
                                    tdigest.add((double) metadataValue);
                                    tdigests.put(numericFields[i], tdigest);
                                }

                                // Does the minimum value exist?
                                if (minimums.containsKey(numericFields[i])) {
                                    // Write if this is lower than existing value
                                    if (metadataValue < minimums.get(numericFields[i])) {
                                        minimums.put(numericFields[i], metadataValue);
                                    }
                                } else {
                                    // Create it!
                                    minimums.put(numericFields[i], metadataValue);
                                }

                                // Does the maximum value exist?
                                if (maximums.containsKey(numericFields[i])) {
                                    // Write if this is lower than existing value
                                    if (metadataValue > maximums.get(numericFields[i])) {
                                        maximums.put(numericFields[i], metadataValue);
                                    }
                                } else {
                                    // Create it!
                                    maximums.put(numericFields[i], metadataValue);
                                }

                                // Sum it up!
                                if (sums.containsKey(numericFields[i])) {
                                    // Increment
                                    sums.put(numericFields[i], sums.get(numericFields[i]) + metadataValue);
                                    counts.put(numericFields[i], counts.get(numericFields[i]) + 1l);
                                } else {
                                    // Create it!
                                    sums.put(numericFields[i], metadataValue);
                                    counts.put(numericFields[i], 1l);
                                }
                            } catch (Exception e) {
                                LOG.error(e.getMessage() + " when processing values for " + key.toString());
                            }
                        }
                    }
                }
            }
        }

        // 
        else if (value instanceof HostDatum) {
            HostDatum buffer = (HostDatum) value;

            // Check homepage URL
            if (buffer.hasHomepageUrl()) {
                hostDatum.setHomepageUrl(buffer.getHomepageUrl());
            }

            // Check lastCheck timestamp
            if (!buffer.isEmpty()) {
                hostDatum.setLastCheck(buffer.getLastCheck());
            }

            // Check and set DNS failures
            if (buffer.getDnsFailures() > 0) {
                hostDatum.setDnsFailures(buffer.getDnsFailures());
            }

            // Check and set connection failures
            if (buffer.getConnectionFailures() > 0) {
                hostDatum.setConnectionFailures(buffer.getConnectionFailures());
            }

            // Check metadata
            if (!buffer.getMetaData().isEmpty()) {
                hostDatum.setMetaData(buffer.getMetaData());
            }

            // Check and set score (score from Web Graph has precedence)
            if (buffer.getScore() > 0) {
                hostDatum.setScore(buffer.getScore());
            }
        }

        // Check for the score
        else if (value instanceof FloatWritable) {
            FloatWritable buffer = (FloatWritable) value;
            score = buffer.get();
        } else {
            LOG.error("Class {} not handled", value.getClass());
        }
    }

    // Check if score was set from Web Graph
    if (score > 0) {
        hostDatum.setScore(score);
    }

    // Set metadata
    for (Map.Entry<String, Map<String, Long>> entry : stringCounts.entrySet()) {
        for (Map.Entry<String, Long> subEntry : entry.getValue().entrySet()) {
            hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()),
                    new LongWritable(subEntry.getValue()));
        }
    }
    for (Map.Entry<String, Float> entry : maximums.entrySet()) {
        hostDatum.getMetaData().put(new Text("max." + entry.getKey()), new FloatWritable(entry.getValue()));
    }
    for (Map.Entry<String, Float> entry : sums.entrySet()) {
        hostDatum.getMetaData().put(new Text("avg." + entry.getKey()),
                new FloatWritable(entry.getValue() / counts.get(entry.getKey())));
    }
    for (Map.Entry<String, TDigest> entry : tdigests.entrySet()) {
        // Emit all percentiles
        for (int i = 0; i < percentiles.length; i++) {
            hostDatum.getMetaData().put(new Text("pct" + Long.toString(percentiles[i]) + "." + entry.getKey()),
                    new FloatWritable((float) entry.getValue().quantile(0.5)));
        }
    }
    for (Map.Entry<String, Float> entry : minimums.entrySet()) {
        hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue()));
    }

    context.getCounter("UpdateHostDb", "total_hosts").increment(1);

    // See if this record is to be checked
    if (shouldCheck(hostDatum)) {
        // Make an entry
        resolverThread = new ResolverThread(key.toString(), hostDatum, context, purgeFailedHostsThreshold);

        // Add the entry to the queue (blocking)
        try {
            queue.put(resolverThread);
        } catch (InterruptedException e) {
            LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e));
        }

        // Do not progress, the datum will be written in the resolver thread
        return;
    } else {
        context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1);
        LOG.info("UpdateHostDb: " + key.toString() + ": skipped_not_eligible");
    }

    // Write the host datum if it wasn't written by the resolver thread
    context.write(key, hostDatum);
}

From source file:org.apache.nutch.hostdb.UpdateHostDbReducer.java

License:Apache License

/**
 * Requests an executor shutdown once all keys have been fed, then polls once
 * per second until the executor reports that it has terminated.
 *
 * @param context the reducer context (not used here)
 */
@Override
public void cleanup(Context context) {
    LOG.info("UpdateHostDb: feeder finished, waiting for shutdown");

    // No more work will be submitted; let the queued resolvers drain
    executor.shutdown();

    // Poll until every resolver has finished
    while (!executor.isTerminated()) {
        try {
            String poolSize = Integer.toString(executor.getPoolSize());
            LOG.info("UpdateHostDb: resolver threads waiting: " + poolSize);
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            // Interrupted while sleeping: log it and keep waiting
            LOG.warn(StringUtils.stringifyException(e));
        }
    }
}

From source file:org.apache.nutch.indexer.DeleteDuplicates.java

License:Apache License

/**
 * Command-line entry point: converts every argument into an index path and
 * passes the array to {@code dedup}.
 *
 * @param args one or more index locations
 * @return 0 when {@code dedup} completes, -1 on missing arguments or when an
 *         exception from {@code dedup} is caught
 * @throws Exception part of the signature; exceptions from {@code dedup}
 *         are logged and turned into -1
 */
public int run(String[] args) throws Exception {

    final int indexCount = args.length;
    if (indexCount == 0) {
        System.err.println("Usage: DeleteDuplicates <indexes> ...");
        return -1;
    }

    // One Path per command-line argument, in the same order
    final Path[] indexPaths = new Path[indexCount];
    int slot = 0;
    for (String location : args) {
        indexPaths[slot++] = new Path(location);
    }

    try {
        dedup(indexPaths);
    } catch (Exception e) {
        LOG.fatal("DeleteDuplicates: " + StringUtils.stringifyException(e));
        return -1;
    }
    return 0;
}

From source file:org.apache.nutch.indexer.field.AnchorFields.java

License:Apache License

/**
 * Configures and runs the "AnchorFields Extractor" job. The job reads the
 * outlink and node directories of the given WebGraphDb and writes its
 * results to the output path.
 *
 * @param webGraphDb The WebGraphDb to read from.
 * @param output The path the extractor writes to.
 *
 * @throws IOException If running the job fails; the error is logged before
 * it is rethrown.
 */
private void runExtractor(Path webGraphDb, Path output) throws IOException {

    final JobConf job = new NutchJob(getConf());
    job.setJobName("AnchorFields Extractor");

    // Inputs: outlinks and nodes of the web graph
    final Path outlinks = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
    final Path nodes = new Path(webGraphDb, WebGraph.NODE_DIR);
    FileInputFormat.addInputPath(job, outlinks);
    FileInputFormat.addInputPath(job, nodes);
    FileOutputFormat.setOutputPath(job, output);
    job.setInputFormat(SequenceFileInputFormat.class);

    // Extractor is registered as both mapper and reducer
    job.setMapperClass(Extractor.class);
    job.setReducerClass(Extractor.class);

    // Intermediate and final key/value types
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LinkDatum.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting extractor job");
    try {
        JobClient.runJob(job);
    } catch (IOException failure) {
        LOG.error(StringUtils.stringifyException(failure));
        throw failure;
    }
    LOG.info("Finished extractor job.");
}

From source file:org.apache.nutch.indexer.field.AnchorFields.java

License:Apache License

/**
 * Configures and runs the "AnchorFields Collector" job. The job reads the
 * links path and the basic fields path and writes {@code FieldWritable}
 * values to the output path.
 *
 * @param basicFields The BasicFields path added as the second job input.
 * @param links The links path added as the first job input.
 * @param output The path the collector writes to.
 *
 * @throws IOException If running the job fails; the error is logged before
 * it is rethrown.
 */
private void runCollector(Path basicFields, Path links, Path output) throws IOException {

    final JobConf job = new NutchJob(getConf());
    job.setJobName("AnchorFields Collector");

    // Inputs: extracted links first, then the basic fields
    FileInputFormat.addInputPath(job, links);
    FileInputFormat.addInputPath(job, basicFields);
    FileOutputFormat.setOutputPath(job, output);
    job.setInputFormat(SequenceFileInputFormat.class);

    // Intermediate key/value types
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);

    // Collector is registered as both mapper and reducer
    job.setMapperClass(Collector.class);
    job.setReducerClass(Collector.class);

    // Final key/value types and format
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldWritable.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
        JobClient.runJob(job);
    } catch (IOException failure) {
        LOG.error(StringUtils.stringifyException(failure));
        throw failure;
    }
    LOG.info("Finished collector job.");
}

From source file:org.apache.nutch.indexer.field.AnchorFields.java

License:Apache License

/**
 * Command-line entry point for the AnchorFields job. Parses the
 * {@code webgraphdb}, {@code basicfields} and {@code output} options and
 * passes them to {@code createFields}.
 *
 * @param args command-line arguments
 * @return 0 on success, -1 when help is requested or a required option is
 *         missing (the help text is printed), -2 when an exception is caught
 * @throws Exception part of the signature; exceptions raised inside the
 *         try block are logged and turned into -2
 */
public int run(String[] args) throws Exception {

    // Declare the supported command-line options
    Option help = OptionBuilder.withArgName("help")
            .withDescription("show this help message")
            .create("help");
    Option outputDir = OptionBuilder.withArgName("output")
            .hasArg()
            .withDescription("the output index directory")
            .create("output");
    Option webGraph = OptionBuilder.withArgName("webgraphdb")
            .hasArg()
            .withDescription("the webgraphdb to use")
            .create("webgraphdb");
    Option basic = OptionBuilder.withArgName("basicfields")
            .hasArgs()
            .withDescription("the basicfields to use")
            .create("basicfields");

    Options options = new Options();
    options.addOption(help);
    options.addOption(webGraph);
    options.addOption(basic);
    options.addOption(outputDir);

    CommandLineParser parser = new GnuParser();
    try {
        CommandLine line = parser.parse(options, args);

        // All three locations are required; "help" always prints the usage
        boolean hasAllRequired = line.hasOption("webgraphdb")
                && line.hasOption("output")
                && line.hasOption("basicfields");
        if (line.hasOption("help") || !hasAllRequired) {
            new HelpFormatter().printHelp("AnchorFields", options);
            return -1;
        }

        Path webGraphDbPath = new Path(line.getOptionValue("webgraphdb"));
        Path basicFieldsPath = new Path(line.getOptionValue("basicfields"));
        Path outputPath = new Path(line.getOptionValue("output"));

        createFields(webGraphDbPath, basicFieldsPath, outputPath);
        return 0;
    } catch (Exception e) {
        LOG.fatal("AnchorFields: " + StringUtils.stringifyException(e));
        return -2;
    }
}