List of usage examples for org.apache.hadoop.mapreduce.Counter.getValue()
long getValue();
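Before the full examples below, here is a minimal sketch of the common pattern: run a job to completion, look up a counter on the finished job, and read its accumulated value with getValue(). The job name and the choice of the built-in TaskCounter.MAP_OUTPUT_RECORDS counter are illustrative assumptions only, and the job setup (mapper, reducer, input and output paths) is elided; this is not taken from any of the projects listed below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;

public class CounterValueSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical job: mapper/reducer classes and input/output paths would be
        // configured here as in the examples below.
        Job job = Job.getInstance(new Configuration(), "counter-getvalue-sketch");
        // ... job.setMapperClass(...), FileInputFormat.addInputPath(...), etc. ...

        job.waitForCompletion(true);

        // After completion, fetch a built-in task counter from the job's Counters
        // and read its accumulated value as a long.
        Counter mapOutputRecords = job.getCounters().findCounter(TaskCounter.MAP_OUTPUT_RECORDS);
        long value = mapOutputRecords.getValue();
        System.out.println("Map output records = " + value);
    }
}

The same pattern applies to user-defined counters: pass any enum constant, or a group name and counter name, to findCounter and read the result with getValue().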
From source file: edu.umn.cs.sthadoop.trajectory.TrajectoryOverlap.java
License: Open Source License
public static void main(String[] args) throws Exception {
    // args = new String[8];
    // args[0] = "/export/scratch/mntgData/geolifeGPS/geolife_Trajectories_1.3/HDFS/index_geolife";
    // args[1] = "/export/scratch/mntgData/geolifeGPS/geolife_Trajectories_1.3/HDFS/knn-dis-result";
    // args[2] = "shape:edu.umn.cs.sthadoop.trajectory.GeolifeTrajectory";
    // args[3] = "interval:2008-05-01,2008-05-30";
    // args[4] = "time:month";
    // args[5] = "traj:39.9119983,116.606835;39.9119783,116.6065483;39.9119599,116.6062649;39.9119416,116.6059899;39.9119233,116.6057282;39.9118999,116.6054783;39.9118849,116.6052366;39.9118666,116.6050099;39.91185,116.604775;39.9118299,116.604525;39.9118049,116.6042649;39.91177,116.6040166;39.9117516,116.6037583;39.9117349,116.6035066;39.9117199,116.6032666;39.9117083,116.6030232;39.9117,116.6027566;39.91128,116.5969383;39.9112583,116.5966766;39.9112383,116.5964232;39.9112149,116.5961699;39.9111933,116.5959249;39.9111716,116.5956883";
    // args[6] = "-overwrite";
    // args[7] = "-local";//"-no-local";
    final OperationsParams params = new OperationsParams(new GenericOptionsParser(args));
    final Path[] paths = params.getPaths();
    if (paths.length <= 1 && !params.checkInput()) {
        printUsage();
        System.exit(1);
    }
    if (paths.length >= 2 && !params.checkInputOutput()) {
        printUsage();
        System.exit(1);
    }
    if (params.get("traj") == null) {
        System.err.println("Trajectory query is missing");
        printUsage();
        System.exit(1);
    }
    // Invoke method to compute the trajectory MBR.
    String rectangle = getTrajectoryRectangle(params.get("traj"));
    params.set("rect", rectangle);
    if (params.get("rect") == null) {
        System.err.println("You must provide a Trajectory Query");
        printUsage();
        System.exit(1);
    }
    if (params.get("interval") == null) {
        System.err.println("Temporal range missing");
        printUsage();
        System.exit(1);
    }
    TextSerializable inObj = params.getShape("shape");
    if (!(inObj instanceof STPoint)) {
        LOG.error("Shape is not instance of STPoint");
        printUsage();
        System.exit(1);
    }
    // Get spatio-temporal slices.
    List<Path> STPaths = getIndexedSlices(params);
    final Path outPath = params.getOutputPath();
    final Rectangle[] queryRanges = params.getShapes("rect", new Rectangle());
    // All running jobs
    final Vector<Long> resultsCounts = new Vector<Long>();
    Vector<Job> jobs = new Vector<Job>();
    Vector<Thread> threads = new Vector<Thread>();
    long t1 = System.currentTimeMillis();
    for (Path stPath : STPaths) {
        final Path inPath = stPath;
        for (int i = 0; i < queryRanges.length; i++) {
            final OperationsParams queryParams = new OperationsParams(params);
            OperationsParams.setShape(queryParams, "rect", queryRanges[i]);
            if (OperationsParams.isLocal(new JobConf(queryParams), inPath)) {
                // Run in local mode
                final Rectangle queryRange = queryRanges[i];
                final Shape shape = queryParams.getShape("shape");
                final Path output = outPath == null ? null
                        : (queryRanges.length == 1 ? outPath : new Path(outPath, String.format("%05d", i)));
                Thread thread = new Thread() {
                    @Override
                    public void run() {
                        FSDataOutputStream outFile = null;
                        final byte[] newLine = System.getProperty("line.separator", "\n").getBytes();
                        try {
                            ResultCollector<Shape> collector = null;
                            if (output != null) {
                                FileSystem outFS = output.getFileSystem(queryParams);
                                final FSDataOutputStream foutFile = outFile = outFS.create(output);
                                collector = new ResultCollector<Shape>() {
                                    final Text tempText = new Text2();

                                    @Override
                                    public synchronized void collect(Shape r) {
                                        try {
                                            tempText.clear();
                                            r.toText(tempText);
                                            foutFile.write(tempText.getBytes(), 0, tempText.getLength());
                                            foutFile.write(newLine);
                                        } catch (IOException e) {
                                            e.printStackTrace();
                                        }
                                    }
                                };
                            } else {
                                outFile = null;
                            }
                            long resultCount = rangeQueryLocal(inPath, queryRange, shape, queryParams, collector);
                            resultsCounts.add(resultCount);
                        } catch (IOException e) {
                            e.printStackTrace();
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        } finally {
                            try {
                                if (outFile != null)
                                    outFile.close();
                            } catch (IOException e) {
                                e.printStackTrace();
                            }
                        }
                    }
                };
                thread.start();
                threads.add(thread);
            } else {
                // Run in MapReduce mode
                Path outTempPath = outPath == null ? null
                        : new Path(outPath, String.format("%05d", i) + "-" + inPath.getName());
                queryParams.setBoolean("background", true);
                Job job = rangeQueryMapReduce(inPath, outTempPath, queryParams);
                jobs.add(job);
            }
        }
    }
    while (!jobs.isEmpty()) {
        Job firstJob = jobs.firstElement();
        firstJob.waitForCompletion(false);
        if (!firstJob.isSuccessful()) {
            System.err.println("Error running job " + firstJob);
            System.err.println("Killing all remaining jobs");
            for (int j = 1; j < jobs.size(); j++)
                jobs.get(j).killJob();
            System.exit(1);
        }
        Counters counters = firstJob.getCounters();
        Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
        resultsCounts.add(outputRecordCounter.getValue());
        jobs.remove(0);
    }
    while (!threads.isEmpty()) {
        try {
            Thread thread = threads.firstElement();
            thread.join();
            threads.remove(0);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
    long t2 = System.currentTimeMillis();
    System.out.println("QueryPlan:");
    for (Path stPath : STPaths) {
        System.out.println(stPath.getName());
    }
    System.out.println("Time for " + queryRanges.length + " jobs is " + (t2 - t1) + " millis");
    System.out.println("Results counts: " + resultsCounts);
}
From source file: gaffer.accumulo.splitpoints.EstimateSplitPointsDriver.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 5) {
        System.err.println("Usage: " + this.getClass().getName()
                + " <mapred_output_directory> <proportion_to_sample> <number_of_tablet_servers> <resulting_split_file> <input_path1>...");
        return 1;
    }

    // Parse arguments
    Path outputPath = new Path(args[0]);
    float proportionToSample = Float.parseFloat(args[1]);
    int numberTabletServers = Integer.parseInt(args[2]);
    Path resultingSplitsFile = new Path(args[3]);
    Path[] inputPaths = new Path[args.length - 4];
    for (int i = 0; i < inputPaths.length; i++) {
        inputPaths[i] = new Path(args[i + 4]);
    }

    // Conf and job
    Configuration conf = getConf();
    conf.setFloat("proportion_to_sample", proportionToSample);
    String jobName = "Estimate split points: input = ";
    for (int i = 0; i < inputPaths.length; i++) {
        jobName += inputPaths[i] + ", ";
    }
    jobName += "output = " + outputPath;
    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(getClass());

    // Input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    for (int i = 0; i < inputPaths.length; i++) {
        SequenceFileInputFormat.addInputPath(job, inputPaths[i]);
    }

    // Mapper
    job.setMapperClass(EstimateSplitPointsMapper.class);
    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    // Reducer
    job.setReducerClass(EstimateSplitPointsReducer.class);
    job.setOutputKeyClass(Key.class);
    job.setOutputValueClass(Value.class);
    job.setNumReduceTasks(1);

    // Output
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }

    // Number of records output
    // NB In the following line use mapred.Task.Counter.REDUCE_OUTPUT_RECORDS rather than
    // mapreduce.TaskCounter.REDUCE_OUTPUT_RECORDS as this is more compatible with earlier
    // versions of Hadoop.
    @SuppressWarnings("deprecation")
    Counter counter = job.getCounters()
            .findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_OUTPUT_RECORDS);
    long recordsOutput = counter.getValue();
    System.out.println("Number of records output = " + recordsOutput);

    // Work out when to output a split point. The number of split points
    // needed is the number of tablet servers minus 1 (because you don't
    // have to output the start of the first tablet or the end of the
    // last tablet).
    long outputEveryNthRecord = recordsOutput / (numberTabletServers - 1);

    // Read through resulting file, pick out the split points and write to
    // file.
    FileSystem fs = FileSystem.get(conf);
    Path resultsFile = new Path(outputPath, "part-r-00000");
    @SuppressWarnings("deprecation")
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, resultsFile, conf);
    PrintStream splitsWriter = new PrintStream(new BufferedOutputStream(fs.create(resultingSplitsFile, true)));
    Key key = new Key();
    Value value = new Value();
    long count = 0;
    int numberSplitPointsOutput = 0;
    while (reader.next(key, value) && numberSplitPointsOutput < numberTabletServers - 1) {
        count++;
        if (count % outputEveryNthRecord == 0) {
            numberSplitPointsOutput++;
            splitsWriter.println(new String(Base64.encodeBase64(key.getRow().getBytes())));
            System.out.println("Written split point: " + key.getRow());
        }
    }
    reader.close();
    splitsWriter.close();
    System.out.println("Number of split points output = " + numberSplitPointsOutput);
    return 0;
}
From source file: gaffer.accumulostore.operation.hdfs.handler.job.tool.SampleDataAndCreateSplitsFileTool.java
License: Apache License
@Override
public int run(final String[] strings) throws OperationException {
    try {
        LOGGER.info("Creating job using SampleDataForSplitPointsJobFactory");
        job = new SampleDataForSplitPointsJobFactory().createJob(operation, store);
    } catch (final IOException e) {
        LOGGER.error("Failed to create Hadoop job: {}", e.getMessage());
        throw new OperationException("Failed to create the Hadoop job: " + e.getMessage(), e);
    }

    try {
        LOGGER.info("Running SampleDataForSplitPoints job (job name is {})", job.getJobName());
        job.waitForCompletion(true);
    } catch (final IOException | InterruptedException | ClassNotFoundException e) {
        LOGGER.error("Exception running job: {}", e.getMessage());
        throw new OperationException("Error while waiting for job to complete: " + e.getMessage(), e);
    }

    try {
        if (!job.isSuccessful()) {
            LOGGER.error("Job was not successful (job name is {})", job.getJobName());
            throw new OperationException("Error running job");
        }
    } catch (final IOException e) {
        LOGGER.error("Exception running job: {}", e.getMessage());
        throw new OperationException("Error running job" + e.getMessage(), e);
    }

    // Find the number of records output
    // NB In the following line use mapred.Task.Counter.REDUCE_OUTPUT_RECORDS rather than
    // mapreduce.TaskCounter.REDUCE_OUTPUT_RECORDS as this is more compatible with earlier
    // versions of Hadoop.
    Counter counter;
    try {
        counter = job.getCounters().findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_OUTPUT_RECORDS);
        LOGGER.info("Number of records output = {}", counter);
    } catch (final IOException e) {
        LOGGER.error("Failed to get counter org.apache.hadoop.mapred.Task.Counter.REDUCE_OUTPUT_RECORDS from job: {}",
                e.getMessage());
        throw new OperationException(
                "Failed to get counter: " + org.apache.hadoop.mapred.Task.Counter.REDUCE_OUTPUT_RECORDS, e);
    }

    int numberTabletServers;
    try {
        numberTabletServers = store.getConnection().instanceOperations().getTabletServers().size();
        LOGGER.info("Number of tablet servers is {}", numberTabletServers);
    } catch (final StoreException e) {
        LOGGER.error("Exception thrown getting number of tablet servers: {}", e.getMessage());
        throw new OperationException(e.getMessage(), e);
    }

    long outputEveryNthRecord = counter.getValue() / (numberTabletServers - 1);

    final Path resultsFile = new Path(operation.getOutputPath(), "part-r-00000");
    LOGGER.info("Will output every {}-th record from {}", outputEveryNthRecord, resultsFile);

    // Read through resulting file, pick out the split points and write to file.
    final Configuration conf = getConf();
    final FileSystem fs;
    try {
        fs = FileSystem.get(conf);
    } catch (final IOException e) {
        LOGGER.error("Exception getting filesystem: {}", e.getMessage());
        throw new OperationException("Failed to get filesystem from configuration: " + e.getMessage(), e);
    }
    LOGGER.info("Writing splits to {}", operation.getResultingSplitsFilePath());
    final Key key = new Key();
    final Value value = new Value();
    long count = 0;
    int numberSplitPointsOutput = 0;
    try (final SequenceFile.Reader reader = new SequenceFile.Reader(fs, resultsFile, conf);
            final PrintStream splitsWriter = new PrintStream(
                    new BufferedOutputStream(fs.create(new Path(operation.getResultingSplitsFilePath()), true)),
                    false, CommonConstants.UTF_8)) {
        while (reader.next(key, value) && numberSplitPointsOutput < numberTabletServers - 1) {
            count++;
            if (count % outputEveryNthRecord == 0) {
                LOGGER.debug("Outputting split point number {} ({})", numberSplitPointsOutput,
                        Base64.encodeBase64(key.getRow().getBytes()));
                numberSplitPointsOutput++;
                splitsWriter.println(new String(Base64.encodeBase64(key.getRow().getBytes()), CommonConstants.UTF_8));
            }
        }
        LOGGER.info("Total number of records read was {}", count);
    } catch (final IOException e) {
        LOGGER.error("Exception reading results file and outputting split points: {}", e.getMessage());
        throw new OperationException(e.getMessage(), e);
    }

    try {
        fs.delete(resultsFile, true);
        LOGGER.info("Deleted the results file {}", resultsFile);
    } catch (final IOException e) {
        LOGGER.error("Failed to delete the results file {}", resultsFile);
        throw new OperationException("Failed to delete the results file: " + e.getMessage(), e);
    }

    return SUCCESS_RESPONSE;
}
From source file: gaffer.accumulostore.operation.hdfs.handler.tool.SampleDataAndCreateSplitsFileTool.java
License: Apache License
@Override
public int run(final String[] strings) throws OperationException {
    try {
        job = new SampleDataForSplitPointsJobFactory().createJob(operation, store);
    } catch (IOException e) {
        throw new OperationException("Failed to create the Hadoop job: " + e.getMessage(), e);
    }
    try {
        job.waitForCompletion(true);
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        throw new OperationException("Error while waiting for job to complete: " + e.getMessage(), e);
    }
    try {
        if (!job.isSuccessful()) {
            throw new OperationException("Error running job");
        }
    } catch (IOException e) {
        throw new OperationException("Error running job" + e.getMessage(), e);
    }

    // Number of records output
    // NB In the following line use mapred.Task.Counter.REDUCE_OUTPUT_RECORDS rather than
    // mapreduce.TaskCounter.REDUCE_OUTPUT_RECORDS as this is more compatible with earlier
    // versions of Hadoop.
    Counter counter;
    try {
        counter = job.getCounters().findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_OUTPUT_RECORDS);
    } catch (IOException e) {
        throw new OperationException(
                "Failed to get counter: " + org.apache.hadoop.mapred.Task.Counter.REDUCE_OUTPUT_RECORDS, e);
    }

    int numberTabletServers;
    try {
        numberTabletServers = store.getConnection().instanceOperations().getTabletServers().size();
    } catch (StoreException e) {
        throw new OperationException(e.getMessage(), e);
    }

    long outputEveryNthRecord = counter.getValue() / (numberTabletServers - 1);

    // Read through resulting file, pick out the split points and write to file.
    Configuration conf = getConf();
    FileSystem fs;
    try {
        fs = FileSystem.get(conf);
    } catch (IOException e) {
        throw new OperationException("Failed to get Filesystem from configuration: " + e.getMessage(), e);
    }
    Path resultsFile = new Path(operation.getInputPath(), "part-r-00000");
    Key key = new Key();
    Value value = new Value();
    long count = 0;
    int numberSplitPointsOutput = 0;
    try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, resultsFile, conf);
            PrintStream splitsWriter = new PrintStream(
                    new BufferedOutputStream(fs.create(operation.getResultingSplitsFilePath(), true)), false,
                    CommonConstants.UTF_8)) {
        while (reader.next(key, value) && numberSplitPointsOutput < numberTabletServers - 1) {
            count++;
            if (count % outputEveryNthRecord == 0) {
                numberSplitPointsOutput++;
                splitsWriter.println(new String(Base64.encodeBase64(key.getRow().getBytes()), CommonConstants.UTF_8));
            }
        }
    } catch (IOException e) {
        throw new OperationException(e.getMessage(), e);
    }

    try {
        fs.delete(resultsFile, true);
    } catch (IOException e) {
        throw new OperationException("Failed to delete the mapreduce result file: " + e.getMessage(), e);
    }

    return SUCCESS_RESPONSE;
}
From source file: gobblin.compaction.event.CompactionSlaEventHelper.java
License: Apache License
private static long getRecordCount(Optional<Job> job) {
    if (!job.isPresent()) {
        return -1l;
    }

    Counters counters = null;
    try {
        counters = job.get().getCounters();
    } catch (IOException e) {
        LOG.debug("Failed to get job counters. Record count will not be set. ", e);
        return -1l;
    }

    Counter recordCounter = counters.findCounter(AvroKeyDedupReducer.EVENT_COUNTER.RECORD_COUNT);

    if (recordCounter != null && recordCounter.getValue() != 0) {
        return recordCounter.getValue();
    }

    recordCounter = counters.findCounter(AvroKeyMapper.EVENT_COUNTER.RECORD_COUNT);

    if (recordCounter != null && recordCounter.getValue() != 0) {
        return recordCounter.getValue();
    }

    LOG.debug("Non zero record count not found in both mapper and reducer counters");
    return -1l;
}
From source file: gobblin.runtime.mapreduce.MRJobLauncher.java
License: Apache License
/**
 * Create a {@link gobblin.metrics.GobblinMetrics} instance for this job run from the Hadoop counters.
 */
@VisibleForTesting
void countersToMetrics(GobblinMetrics metrics) throws IOException {
    Optional<Counters> counters = Optional.fromNullable(this.job.getCounters());

    if (counters.isPresent()) {
        // Write job-level counters
        CounterGroup jobCounterGroup = counters.get().getGroup(MetricGroup.JOB.name());
        for (Counter jobCounter : jobCounterGroup) {
            metrics.getCounter(jobCounter.getName()).inc(jobCounter.getValue());
        }

        // Write task-level counters
        CounterGroup taskCounterGroup = counters.get().getGroup(MetricGroup.TASK.name());
        for (Counter taskCounter : taskCounterGroup) {
            metrics.getCounter(taskCounter.getName()).inc(taskCounter.getValue());
        }
    }
}
From source file: io.covert.dns.collection.CollectionJob.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        usage("");
    }

    String dclass = args[0];
    String types = args[1];
    String inDir = args[2];
    String outDir = args[3];

    Configuration conf = getConf();

    if (conf.get("dns.collection.num.resolvers") == null)
        conf.setInt("dns.collection.num.resolvers", 50);
    if (conf.get("dns.collection.nameservers") == null)
        conf.set("dns.collection.nameservers", "127.0.0.1");

    Job job = new Job(conf);
    job.setJobName(CollectionJob.class.getSimpleName() + ": types=" + types + ", dclass=" + dclass + " inDir="
            + inDir + ", outDir=" + outDir + ", resolvers=" + conf.get("dns.collection.nameservers"));
    job.setJarByClass(getClass());

    job.setMapperClass(CollectionMapper.class);
    job.setNumReduceTasks(0);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BytesWritable.class);

    job.setInputFormatClass(DnsRequestInputFormat.class);
    DnsRequestInputFormat.setInputPaths(job, new Path(inDir));
    DnsRequestInputFormat.configure(job, dclass.toUpperCase(), Arrays.asList(types.split(",")),
            Arrays.asList(""));

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outDir));
    SequenceFileOutputFormat.setCompressOutput(job, true);

    job.submit();

    int retVal = job.waitForCompletion(true) ? 0 : 1;

    CounterGroup counters = job.getCounters().getGroup(CollectionMapper.RESOLVER_GROUP);
    Counter constructMessageMS = counters.findCounter(CollectionMapper.CONSTRUCT_MESSAGE_MS);
    Counter parseResponseMS = counters.findCounter(CollectionMapper.PARSE_RESPONSE_MS);
    Counter performRequestMS = counters.findCounter(CollectionMapper.PERFORM_REQUEST_MS);
    Counter totalRequestHandlingMS = counters.findCounter(CollectionMapper.TOTAL_REQUEST_HANDLING_MS);

    Log.info("Total ConstructMessage percent: "
            + (double) (constructMessageMS.getValue() * 100L) / ((double) totalRequestHandlingMS.getValue()));
    Log.info("Total ParseResponse percent: "
            + (double) (parseResponseMS.getValue() * 100L) / ((double) totalRequestHandlingMS.getValue()));
    Log.info("Total PerformRequest percent: "
            + (double) (performRequestMS.getValue() * 100L) / ((double) totalRequestHandlingMS.getValue()));

    return retVal;
}
From source file: io.druid.indexer.IndexGeneratorJob.java
License: Apache License
public boolean run() {
    try {
        Job job = Job.getInstance(new Configuration(),
                String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.23");

        JobHelper.injectSystemProperties(job);
        config.addJobProperties(job);

        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(BytesWritable.class);

        SortableBytes.useSortableBytesAsMapOutputKey(job);

        int numReducers = Iterables.size(config.getAllBuckets().get());
        if (numReducers == 0) {
            throw new RuntimeException("No buckets?? seems there is no data to index.");
        }

        if (config.getSchema().getTuningConfig().getUseCombiner()) {
            job.setCombinerClass(IndexGeneratorCombiner.class);
            job.setCombinerKeyGroupingComparatorClass(BytesWritable.Comparator.class);
        }

        job.setNumReduceTasks(numReducers);
        job.setPartitionerClass(IndexGeneratorPartitioner.class);

        setReducerClass(job);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

        config.addInputPaths(job);

        // hack to get druid.processing.bitmap property passed down to hadoop job.
        // once IndexIO doesn't rely on globally injected properties, we can move this into the HadoopTuningConfig.
        final String bitmapProperty = "druid.processing.bitmap.type";
        final String bitmapType = HadoopDruidIndexerConfig.properties.getProperty(bitmapProperty);
        if (bitmapType != null) {
            for (String property : new String[] { "mapreduce.reduce.java.opts", "mapreduce.map.java.opts" }) {
                // prepend property to allow overriding using hadoop.xxx properties by JobHelper.injectSystemProperties above
                String value = Strings.nullToEmpty(job.getConfiguration().get(property));
                job.getConfiguration().set(property, String.format("-D%s=%s %s", bitmapProperty, bitmapType, value));
            }
        }

        config.intoConfiguration(job);

        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), job);

        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

        boolean success = job.waitForCompletion(true);

        Counter invalidRowCount = job.getCounters()
                .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());

        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file: kogiri.mapreduce.preprocess.indexing.stage3.KmerStatisticsBuilder.java
License: Open Source License
private int runJob(PreprocessorConfig ppConfig) throws Exception {
    // check config
    validatePreprocessorConfig(ppConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    ppConfig.getClusterConfiguration().configureTo(conf);
    ppConfig.saveTo(conf);

    Path[] inputFiles = KmerIndexHelper.getAllKmerIndexIndexFilePath(conf, ppConfig.getKmerIndexPath());
    for (Path inputFile : inputFiles) {
        LOG.info(inputFile);
    }

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < inputFiles.length; round++) {
        Path roundInputFile = inputFiles[round];
        Path[] roundInputKmerIndexPartFiles = KmerIndexHelper.getKmerIndexPartFilePath(conf, roundInputFile);

        Job job = new Job(conf, "Kogiri Preprocessor - Computing Kmer Statistics (" + round + " of "
                + inputFiles.length + ")");
        job.setJarByClass(KmerStatisticsBuilder.class);

        // Mapper
        job.setMapperClass(KmerStatisticsBuilderMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Specify key / value
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Inputs
        Path[] kmerIndexPartDataFiles = KmerIndexHelper.getAllKmerIndexPartDataFilePath(conf,
                roundInputKmerIndexPartFiles);
        SequenceFileInputFormat.addInputPaths(job, FileSystemHelper.makeCommaSeparated(kmerIndexPartDataFiles));

        LOG.info("Input file : ");
        LOG.info("> " + roundInputFile.toString());

        // Outputs
        job.setOutputFormatClass(NullOutputFormat.class);

        job.setNumReduceTasks(0);

        // Execute job and return status
        boolean result = job.waitForCompletion(true);

        jobs.add(job);

        // check results
        if (result) {
            CounterGroup uniqueGroup = job.getCounters().getGroup(KmerStatisticsHelper.getCounterGroupNameUnique());
            CounterGroup totalGroup = job.getCounters().getGroup(KmerStatisticsHelper.getCounterGroupNameTotal());
            CounterGroup squareGroup = job.getCounters().getGroup(KmerStatisticsHelper.getCounterGroupNameSquare());
            CounterGroup logTFSquareGroup = job.getCounters()
                    .getGroup(KmerStatisticsHelper.getCounterGroupNameLogTFSquare());

            Iterator<Counter> uniqueIterator = uniqueGroup.iterator();
            while (uniqueIterator.hasNext()) {
                long count = 0;
                long length = 0;
                long square = 0;
                double logTFSquare = 0;
                double real_mean = 0;
                double stddev = 0;
                double tf_cosnorm_base = 0;

                Counter uniqueCounter = uniqueIterator.next();
                Counter totalCounter = totalGroup.findCounter(uniqueCounter.getName());
                Counter squareCounter = squareGroup.findCounter(uniqueCounter.getName());
                Counter logTFSquareCounter = logTFSquareGroup.findCounter(uniqueCounter.getName());

                count = uniqueCounter.getValue();
                length = totalCounter.getValue();
                square = squareCounter.getValue();
                logTFSquare = logTFSquareCounter.getValue() / 1000.0;

                tf_cosnorm_base = Math.sqrt(logTFSquare);
                real_mean = (double) length / (double) count;

                // stddev = sqrt((sum(lengths ^ 2) / count) - (mean ^ 2))
                double mean = Math.pow(real_mean, 2);
                double term = (double) square / (double) count;
                stddev = Math.sqrt(term - mean);

                LOG.info("distinct k-mers " + uniqueCounter.getName() + " : " + count);
                LOG.info("total k-mers " + uniqueCounter.getName() + " : " + length);
                LOG.info("average " + uniqueCounter.getName() + " : " + real_mean);
                LOG.info("std-deviation " + uniqueCounter.getName() + " : " + stddev);
                LOG.info("tf-cos-norm-base " + uniqueCounter.getName() + " : " + tf_cosnorm_base);

                Path outputHadoopPath = new Path(ppConfig.getKmerStatisticsPath(),
                        KmerStatisticsHelper.makeKmerStatisticsFileName(uniqueCounter.getName()));
                FileSystem fs = outputHadoopPath.getFileSystem(conf);

                KmerStatistics statistics = new KmerStatistics();
                statistics.setSampleName(uniqueCounter.getName());
                statistics.setKmerSize(ppConfig.getKmerSize());
                statistics.setUniqueKmers(count);
                statistics.setTotalKmers(length);
                statistics.setAverageFrequency(real_mean);
                statistics.setStdDeviation(stddev);
                statistics.setTFCosineNormBase(tf_cosnorm_base);

                statistics.saveTo(fs, outputHadoopPath);
            }
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + inputFiles.length);
            job_result = false;
            break;
        }
    }

    // report
    if (ppConfig.getReportPath() != null && !ppConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(ppConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}
From source file: libra.preprocess.stage2.KmerIndexBuilder.java
License: Apache License
private void createStatisticsOfIndex(Path statisticsPath, Path inputPath, Configuration conf, Counters counters,
        int kmerSize) throws IOException {
    CounterGroup logTFSquareGroup = counters.getGroup(KmerStatisticsHelper.getCounterGroupNameLogTFSquare());

    Iterator<Counter> logTFSquareGroupIterator = logTFSquareGroup.iterator();
    while (logTFSquareGroupIterator.hasNext()) {
        Counter logTFSquareCounter = logTFSquareGroupIterator.next();
        if (logTFSquareCounter.getName().equals(inputPath.getName())) {
            double logTFSquare = 0;
            double tf_cosnorm_base = 0;

            logTFSquare = logTFSquareCounter.getValue() / 1000.0;

            tf_cosnorm_base = Math.sqrt(logTFSquare);

            LOG.info("tf-cos-norm-base " + logTFSquareCounter.getName() + " : " + tf_cosnorm_base);

            Path outputHadoopPath = new Path(statisticsPath,
                    KmerStatisticsHelper.makeKmerStatisticsFileName(logTFSquareCounter.getName()));
            FileSystem fs = outputHadoopPath.getFileSystem(conf);

            KmerStatistics statistics = new KmerStatistics();
            statistics.setSampleName(logTFSquareCounter.getName());
            statistics.setKmerSize(kmerSize);
            statistics.setTFCosineNormBase(tf_cosnorm_base);

            statistics.saveTo(fs, outputHadoopPath);
        }
    }
}