List of usage examples for org.apache.hadoop.util.StringUtils.stringifyException
public static String stringifyException(Throwable e)
From source file:org.apache.nutch.host.HostInjectorJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length < 1) { System.err.println("Usage: HostInjectorJob <host_dir>"); return -1; }// w w w . j a v a 2 s . co m try { boolean success = inject(new Path(args[0])); if (!success) { LOG.error("HostInjectorJob: failed "); return -1; } LOG.info("HostInjectorJob: finished"); return -0; } catch (Exception e) { LOG.error("HostInjectorJob: " + StringUtils.stringifyException(e)); return -1; } }
From source file:org.apache.nutch.hostdb.ReadHostDb.java
License:Apache License
/**
 * Command-line entry point of the host database reader.
 *
 * <p>Every element of {@code args} is scanned for the flags
 * {@code -dumpHomepages}, {@code -dumpHostnames}, {@code -get <url>} and
 * {@code -expr <expr>}. When {@code -get} is given, a single record is looked
 * up under {@code <hostdb>/current}; otherwise the host database at
 * {@code args[0]} is read and written to {@code args[1]}.
 *
 * @param args command-line arguments; {@code args[0]} is the host database,
 *        {@code args[1]} the output path (unused when {@code -get} is given)
 * @return 0 on success; -1 on a usage error, when a flag that needs a value
 *         is the last argument, or when reading fails with an exception
 * @throws Exception declared by the signature; exceptions raised while
 *         reading are caught and logged inside this method
 */
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println(
        "Usage: ReadHostDb <hostdb> [-get <url>] [<output> [-dumpHomepages | -dumpHostnames | -expr <expr.>]]");
    return -1;
  }

  boolean dumpHomepages = false;
  boolean dumpHostnames = false;
  String expr = null;
  String get = null;

  for (int i = 0; i < args.length; i++) {
    String arg = args[i];

    if (arg.equals("-dumpHomepages")) {
      LOG.info("ReadHostDb: dumping homepage URL's");
      dumpHomepages = true;
    } else if (arg.equals("-dumpHostnames")) {
      LOG.info("ReadHostDb: dumping hostnames");
      dumpHostnames = true;
    } else if (arg.equals("-get") || arg.equals("-expr")) {
      // Both flags consume the following argument; without this guard a
      // trailing flag would fail with ArrayIndexOutOfBoundsException.
      if (i + 1 >= args.length) {
        System.err.println("ReadHostDb: missing value for option " + arg);
        return -1;
      }
      // Advance past the value so it is never interpreted as a flag itself.
      String value = args[++i];
      if (arg.equals("-get")) {
        get = value;
        LOG.info("ReadHostDb: get: " + get);
      } else {
        expr = value;
        LOG.info("ReadHostDb: evaluating expression: " + expr);
      }
    }
  }

  try {
    if (get != null) {
      getHostDbRecord(new Path(args[0], "current"), get);
    } else {
      readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr);
    }
    return 0;
  } catch (Exception e) {
    LOG.error("ReadHostDb: " + StringUtils.stringifyException(e));
    return -1;
  }
}
From source file:org.apache.nutch.hostdb.ResolverThread.java
License:Apache License
/** * *///from w w w.ja v a 2 s.c o m public void run() { // Resolve the host and act appropriatly try { // Throws an exception if host is not found @SuppressWarnings("unused") InetAddress inetAddr = InetAddress.getByName(host); if (datum.isEmpty()) { context.getCounter("UpdateHostDb", "new_known_host").increment(1); datum.setLastCheck(); LOG.info(host + ": new_known_host " + datum); } else if (datum.getDnsFailures() > 0) { context.getCounter("UpdateHostDb", "rediscovered_host").increment(1); datum.setLastCheck(); datum.setDnsFailures(0l); LOG.info(host + ": rediscovered_host " + datum); } else { context.getCounter("UpdateHostDb", "existing_known_host").increment(1); datum.setLastCheck(); LOG.info(host + ": existing_known_host " + datum); } // Write the host datum context.write(hostText, datum); } catch (UnknownHostException e) { try { // If the counter is empty we'll initialize with date = today and 1 failure if (datum.isEmpty()) { datum.setLastCheck(); datum.setDnsFailures(1l); context.write(hostText, datum); context.getCounter("UpdateHostDb", "new_unknown_host").increment(1); LOG.info(host + ": new_unknown_host " + datum); } else { datum.setLastCheck(); datum.incDnsFailures(); // Check if this host should be forgotten if (purgeFailedHostsThreshold == -1 || purgeFailedHostsThreshold < datum.getDnsFailures()) { context.write(hostText, datum); context.getCounter("UpdateHostDb", "existing_unknown_host").increment(1); LOG.info(host + ": existing_unknown_host " + datum); } else { context.getCounter("UpdateHostDb", "purged_unknown_host").increment(1); LOG.info(host + ": purged_unknown_host " + datum); } } context.getCounter("UpdateHostDb", Long.toString(datum.numFailures()) + "_times_failed") .increment(1); } catch (Exception ioe) { LOG.warn(StringUtils.stringifyException(ioe)); } } catch (Exception e) { LOG.warn(StringUtils.stringifyException(e)); } context.getCounter("UpdateHostDb", "checked_hosts").increment(1); }
From source file:org.apache.nutch.hostdb.UpdateHostDb.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: UpdateHostDb -hostdb <hostdb> " + "[-tophosts <tophosts>] [-crawldb <crawldb>] [-checkAll] [-checkFailed]" + " [-checkNew] [-checkKnown] [-force] [-filter] [-normalize]"); return -1; }//from w ww. j a v a 2 s. c o m Path hostDb = null; Path crawlDb = null; Path topHosts = null; boolean checkFailed = false; boolean checkNew = false; boolean checkKnown = false; boolean force = false; boolean filter = false; boolean normalize = false; for (int i = 0; i < args.length; i++) { if (args[i].equals("-hostdb")) { hostDb = new Path(args[i + 1]); LOG.info("UpdateHostDb: hostdb: " + hostDb); i++; } if (args[i].equals("-crawldb")) { crawlDb = new Path(args[i + 1]); LOG.info("UpdateHostDb: crawldb: " + crawlDb); i++; } if (args[i].equals("-tophosts")) { topHosts = new Path(args[i + 1]); LOG.info("UpdateHostDb: tophosts: " + topHosts); i++; } if (args[i].equals("-checkFailed")) { LOG.info("UpdateHostDb: checking failed hosts"); checkFailed = true; } if (args[i].equals("-checkNew")) { LOG.info("UpdateHostDb: checking new hosts"); checkNew = true; } if (args[i].equals("-checkKnown")) { LOG.info("UpdateHostDb: checking known hosts"); checkKnown = true; } if (args[i].equals("-checkAll")) { LOG.info("UpdateHostDb: checking all hosts"); checkFailed = true; checkNew = true; checkKnown = true; } if (args[i].equals("-force")) { LOG.info("UpdateHostDb: forced check"); force = true; } if (args[i].equals("-filter")) { LOG.info("UpdateHostDb: filtering enabled"); filter = true; } if (args[i].equals("-normalize")) { LOG.info("UpdateHostDb: normalizing enabled"); normalize = true; } } if (hostDb == null) { System.err.println("hostDb is mandatory"); return -1; } try { updateHostDb(hostDb, crawlDb, topHosts, checkFailed, checkNew, checkKnown, force, filter, normalize); return 0; } catch (Exception e) { LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e)); return -1; } }
From source file:org.apache.nutch.hostdb.UpdateHostDbReducer.java
License:Apache License
/** *// www . j a va 2s . com */ @Override public void reduce(Text key, Iterable<NutchWritable> values, Context context) throws IOException, InterruptedException { Map<String, Map<String, Long>> stringCounts = new HashMap<>(); Map<String, Float> maximums = new HashMap<>(); Map<String, Float> sums = new HashMap<>(); // used to calc averages Map<String, Long> counts = new HashMap<>(); // used to calc averages Map<String, Float> minimums = new HashMap<>(); Map<String, TDigest> tdigests = new HashMap<String, TDigest>(); HostDatum hostDatum = new HostDatum(); float score = 0; if (stringFields != null) { for (int i = 0; i < stringFields.length; i++) { stringCounts.put(stringFields[i], new HashMap<>()); } } // Loop through all values until we find a non-empty HostDatum or use // an empty if this is a new host for the host db for (NutchWritable val : values) { final Writable value = val.get(); // unwrap // Count crawl datum status's and collect metadata from fields if (value instanceof CrawlDatum) { CrawlDatum buffer = (CrawlDatum) value; // Set the correct status field switch (buffer.getStatus()) { case CrawlDatum.STATUS_DB_UNFETCHED: hostDatum.setUnfetched(hostDatum.getUnfetched() + 1l); break; case CrawlDatum.STATUS_DB_FETCHED: hostDatum.setFetched(hostDatum.getFetched() + 1l); break; case CrawlDatum.STATUS_DB_GONE: hostDatum.setGone(hostDatum.getGone() + 1l); break; case CrawlDatum.STATUS_DB_REDIR_TEMP: hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1l); break; case CrawlDatum.STATUS_DB_REDIR_PERM: hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1l); break; case CrawlDatum.STATUS_DB_NOTMODIFIED: hostDatum.setNotModified(hostDatum.getNotModified() + 1l); break; } // Record connection failures if (buffer.getRetriesSinceFetch() != 0) { hostDatum.incConnectionFailures(); } // Only gather metadata statistics for proper fetched pages if (buffer.getStatus() == CrawlDatum.STATUS_DB_FETCHED || buffer.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) { // Deal with the 
string fields if (stringFields != null) { for (int i = 0; i < stringFields.length; i++) { // Does this field exist? if (buffer.getMetaData().get(stringFieldWritables[i]) != null) { // Get it! String metadataValue = null; try { metadataValue = buffer.getMetaData().get(stringFieldWritables[i]).toString(); } catch (Exception e) { LOG.error("Metadata field " + stringFields[i] + " is probably not a numeric value"); } // Does the value exist? if (stringCounts.get(stringFields[i]).containsKey(metadataValue)) { // Yes, increment it stringCounts.get(stringFields[i]).put(metadataValue, stringCounts.get(stringFields[i]).get(metadataValue) + 1l); } else { // Create it! stringCounts.get(stringFields[i]).put(metadataValue, 1l); } } } } // Deal with the numeric fields if (numericFields != null) { for (int i = 0; i < numericFields.length; i++) { // Does this field exist? if (buffer.getMetaData().get(numericFieldWritables[i]) != null) { try { // Get it! Float metadataValue = Float.parseFloat( buffer.getMetaData().get(numericFieldWritables[i]).toString()); // Does the median value exist? if (tdigests.containsKey(numericFields[i])) { tdigests.get(numericFields[i]).add(metadataValue); } else { // Create it! TDigest tdigest = TDigest.createDigest(100); tdigest.add((double) metadataValue); tdigests.put(numericFields[i], tdigest); } // Does the minimum value exist? if (minimums.containsKey(numericFields[i])) { // Write if this is lower than existing value if (metadataValue < minimums.get(numericFields[i])) { minimums.put(numericFields[i], metadataValue); } } else { // Create it! minimums.put(numericFields[i], metadataValue); } // Does the maximum value exist? if (maximums.containsKey(numericFields[i])) { // Write if this is lower than existing value if (metadataValue > maximums.get(numericFields[i])) { maximums.put(numericFields[i], metadataValue); } } else { // Create it! maximums.put(numericFields[i], metadataValue); } // Sum it up! 
if (sums.containsKey(numericFields[i])) { // Increment sums.put(numericFields[i], sums.get(numericFields[i]) + metadataValue); counts.put(numericFields[i], counts.get(numericFields[i]) + 1l); } else { // Create it! sums.put(numericFields[i], metadataValue); counts.put(numericFields[i], 1l); } } catch (Exception e) { LOG.error(e.getMessage() + " when processing values for " + key.toString()); } } } } } } // else if (value instanceof HostDatum) { HostDatum buffer = (HostDatum) value; // Check homepage URL if (buffer.hasHomepageUrl()) { hostDatum.setHomepageUrl(buffer.getHomepageUrl()); } // Check lastCheck timestamp if (!buffer.isEmpty()) { hostDatum.setLastCheck(buffer.getLastCheck()); } // Check and set DNS failures if (buffer.getDnsFailures() > 0) { hostDatum.setDnsFailures(buffer.getDnsFailures()); } // Check and set connection failures if (buffer.getConnectionFailures() > 0) { hostDatum.setConnectionFailures(buffer.getConnectionFailures()); } // Check metadata if (!buffer.getMetaData().isEmpty()) { hostDatum.setMetaData(buffer.getMetaData()); } // Check and set score (score from Web Graph has precedence) if (buffer.getScore() > 0) { hostDatum.setScore(buffer.getScore()); } } // Check for the score else if (value instanceof FloatWritable) { FloatWritable buffer = (FloatWritable) value; score = buffer.get(); } else { LOG.error("Class {} not handled", value.getClass()); } } // Check if score was set from Web Graph if (score > 0) { hostDatum.setScore(score); } // Set metadata for (Map.Entry<String, Map<String, Long>> entry : stringCounts.entrySet()) { for (Map.Entry<String, Long> subEntry : entry.getValue().entrySet()) { hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()), new LongWritable(subEntry.getValue())); } } for (Map.Entry<String, Float> entry : maximums.entrySet()) { hostDatum.getMetaData().put(new Text("max." 
+ entry.getKey()), new FloatWritable(entry.getValue())); } for (Map.Entry<String, Float> entry : sums.entrySet()) { hostDatum.getMetaData().put(new Text("avg." + entry.getKey()), new FloatWritable(entry.getValue() / counts.get(entry.getKey()))); } for (Map.Entry<String, TDigest> entry : tdigests.entrySet()) { // Emit all percentiles for (int i = 0; i < percentiles.length; i++) { hostDatum.getMetaData().put(new Text("pct" + Long.toString(percentiles[i]) + "." + entry.getKey()), new FloatWritable((float) entry.getValue().quantile(0.5))); } } for (Map.Entry<String, Float> entry : minimums.entrySet()) { hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue())); } context.getCounter("UpdateHostDb", "total_hosts").increment(1); // See if this record is to be checked if (shouldCheck(hostDatum)) { // Make an entry resolverThread = new ResolverThread(key.toString(), hostDatum, context, purgeFailedHostsThreshold); // Add the entry to the queue (blocking) try { queue.put(resolverThread); } catch (InterruptedException e) { LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e)); } // Do not progress, the datum will be written in the resolver thread return; } else { context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1); LOG.info("UpdateHostDb: " + key.toString() + ": skipped_not_eligible"); } // Write the host datum if it wasn't written by the resolver thread context.write(key, hostDatum); }
From source file:org.apache.nutch.hostdb.UpdateHostDbReducer.java
License:Apache License
/** * Shut down all running threads and wait for completion. *///ww w .j a v a 2s . c o m @Override public void cleanup(Context context) { LOG.info("UpdateHostDb: feeder finished, waiting for shutdown"); // If we're here all keys have been fed and we can issue a shut down executor.shutdown(); boolean finished = false; // Wait until all resolvers have finished while (!finished) { try { // Wait for the executor to shut down completely if (!executor.isTerminated()) { LOG.info("UpdateHostDb: resolver threads waiting: " + Integer.toString(executor.getPoolSize())); Thread.sleep(1000); } else { // All is well, get out finished = true; } } catch (InterruptedException e) { // Huh? LOG.warn(StringUtils.stringifyException(e)); } } }
From source file:org.apache.nutch.indexer.DeleteDuplicates.java
License:Apache License
/**
 * Command-line entry point: treats every argument as an index path and
 * passes them all to {@code dedup}.
 *
 * @param args one or more index paths
 * @return 0 on success, -1 on a usage error or when {@code dedup} throws
 * @throws Exception declared by the signature; exceptions raised by
 *         {@code dedup} are caught and logged inside this method
 */
public int run(String[] args) throws Exception {
  final int argCount = args.length;
  if (argCount < 1) {
    System.err.println("Usage: DeleteDuplicates <indexes> ...");
    return -1;
  }

  // One Path per command-line argument, in the same order
  final Path[] indexPaths = new Path[argCount];
  int slot = 0;
  for (String arg : args) {
    indexPaths[slot++] = new Path(arg);
  }

  int exitCode;
  try {
    dedup(indexPaths);
    exitCode = 0;
  } catch (Exception e) {
    LOG.fatal("DeleteDuplicates: " + StringUtils.stringifyException(e));
    exitCode = -1;
  }
  return exitCode;
}
From source file:org.apache.nutch.indexer.field.AnchorFields.java
License:Apache License
/**
 * Configures and runs the extractor job, which reads the outlink and node
 * directories of the web graph and writes link data to {@code output}.
 *
 * @param webGraphDb The WebGraphDb to pull from.
 * @param output The extractor output.
 *
 * @throws IOException If an error occurs while running the extractor; the
 *         error is logged before being rethrown.
 */
private void runExtractor(Path webGraphDb, Path output) throws IOException {
  final JobConf job = new NutchJob(getConf());
  job.setJobName("AnchorFields Extractor");

  // Inputs: outlinks first, then nodes
  final Path outlinks = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
  final Path nodes = new Path(webGraphDb, WebGraph.NODE_DIR);
  FileInputFormat.addInputPath(job, outlinks);
  FileInputFormat.addInputPath(job, nodes);
  FileOutputFormat.setOutputPath(job, output);

  // File formats
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);

  // Extractor serves as both mapper and reducer
  job.setMapperClass(Extractor.class);
  job.setReducerClass(Extractor.class);

  // Key/value types of the intermediate and the final output
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(ObjectWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LinkDatum.class);

  LOG.info("Starting extractor job");
  try {
    JobClient.runJob(job);
  } catch (IOException failure) {
    LOG.error(StringUtils.stringifyException(failure));
    throw failure;
  }
  LOG.info("Finished extractor job.");
}
From source file:org.apache.nutch.indexer.field.AnchorFields.java
License:Apache License
/**
 * Configures and runs the collector job, which reads the extracted links
 * together with the basic fields and writes {@code FieldWritable} values to
 * {@code output}.
 *
 * @param basicFields The BasicFields which must be present to collect anchors
 *        to avoid orphan fields.
 * @param links The outlinks path.
 * @param output The collector output.
 *
 * @throws IOException If an error occurs while running the collector; the
 *         error is logged before being rethrown.
 */
private void runCollector(Path basicFields, Path links, Path output) throws IOException {
  final JobConf job = new NutchJob(getConf());
  job.setJobName("AnchorFields Collector");

  // Inputs: links first, then the basic fields
  FileInputFormat.addInputPath(job, links);
  FileInputFormat.addInputPath(job, basicFields);
  FileOutputFormat.setOutputPath(job, output);

  // File formats
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);

  // Collector serves as both mapper and reducer
  job.setMapperClass(Collector.class);
  job.setReducerClass(Collector.class);

  // Key/value types of the intermediate and the final output
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(ObjectWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(FieldWritable.class);

  LOG.info("Starting collector job");
  try {
    JobClient.runJob(job);
  } catch (IOException failure) {
    LOG.error(StringUtils.stringifyException(failure));
    throw failure;
  }
  LOG.info("Finished collector job.");
}
From source file:org.apache.nutch.indexer.field.AnchorFields.java
License:Apache License
/**
 * Runs the AnchorFields job from the command line.
 *
 * <p>Requires the {@code webgraphdb}, {@code basicfields} and {@code output}
 * options; prints the help text when any of them is missing or when
 * {@code help} is given.
 *
 * @param args command-line arguments
 * @return 0 on success, -1 when the help text was printed, -2 when parsing
 *         or the job itself failed with an exception
 * @throws Exception declared by the signature; exceptions are caught and
 *         logged inside this method
 */
public int run(String[] args) throws Exception {
  // OptionBuilder is a static builder: each option is described by a series
  // of static calls and materialised by create().
  OptionBuilder.withArgName("help");
  OptionBuilder.withDescription("show this help message");
  final Option helpOpts = OptionBuilder.create("help");

  OptionBuilder.withArgName("output");
  OptionBuilder.hasArg();
  OptionBuilder.withDescription("the output index directory");
  final Option outputOpts = OptionBuilder.create("output");

  OptionBuilder.withArgName("webgraphdb");
  OptionBuilder.hasArg();
  OptionBuilder.withDescription("the webgraphdb to use");
  final Option webGraphDbOpts = OptionBuilder.create("webgraphdb");

  OptionBuilder.withArgName("basicfields");
  OptionBuilder.hasArgs();
  OptionBuilder.withDescription("the basicfields to use");
  final Option basicFieldOpts = OptionBuilder.create("basicfields");

  final Options options = new Options();
  options.addOption(helpOpts);
  options.addOption(webGraphDbOpts);
  options.addOption(basicFieldOpts);
  options.addOption(outputOpts);

  final CommandLineParser parser = new GnuParser();
  try {
    final CommandLine line = parser.parse(options, args);

    final boolean helpRequested = line.hasOption("help");
    final boolean allRequiredGiven = line.hasOption("webgraphdb") && line.hasOption("output")
        && line.hasOption("basicfields");

    if (helpRequested || !allRequiredGiven) {
      new HelpFormatter().printHelp("AnchorFields", options);
      return -1;
    }

    final Path webGraphDbPath = new Path(line.getOptionValue("webgraphdb"));
    final Path outputPath = new Path(line.getOptionValue("output"));
    final Path basicFieldsPath = new Path(line.getOptionValue("basicfields"));

    createFields(webGraphDbPath, basicFieldsPath, outputPath);
    return 0;
  } catch (Exception e) {
    LOG.fatal("AnchorFields: " + StringUtils.stringifyException(e));
    return -2;
  }
}