List of usage examples for org.apache.hadoop.fs Path SEPARATOR
String SEPARATOR
To view the source code for org.apache.hadoop.fs Path SEPARATOR, click the Source Link.
From source file:org.mitre.ccv.mapred.CalculateCosineDistanceMatrix.java
License:Open Source License
/** * Writes out the matrix in row major (packed) order. No labels are outputed. * * @param jobConf//from w ww . j a v a2 s . c o m * @param input * @param output * @param digits * @throws IOException */ public static void printRowMajorMatrix(JobConf jobConf, String input, String output, int digits) throws IOException { JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class); DecimalFormat format = new DecimalFormat(); format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US)); format.setMinimumIntegerDigits(1); format.setMaximumFractionDigits(digits); //format.setMinimumFractionDigits(fractionDigits); format.setGroupingUsed(false); final Path inputPath = new Path(input); final FileSystem fs = inputPath.getFileSystem(conf); final Path qInputPath = fs.makeQualified(inputPath); final Path outputPath = new Path(output); Path[] paths = FileUtils.ls(conf, qInputPath.toString() + Path.SEPARATOR + "part-*"); FSDataOutputStream fos = fs.create(outputPath, true); // throws nothing! final Writer writer = new OutputStreamWriter(fos); final Text key = new Text(); final DenseVectorWritable value = new DenseVectorWritable(); for (int idx = 0; idx < paths.length; idx++) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf); boolean hasNext = reader.next(key, value); while (hasNext) { final DenseVector vector = value.get(); final StringBuilder sb = new StringBuilder(); for (int i = 0; i < vector.getCardinality(); i++) { final String s = format.format(vector.get(i)); // format the number sb.append(s); sb.append(' '); } writer.write(sb.toString()); hasNext = reader.next(key, value); } try { writer.flush(); reader.close(); } catch (IOException ioe) { // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size LOG.debug("Probably caused by closing the SequenceFile.Reader. 
All is well", ioe); } } try { writer.close(); fos.flush(); fos.close(); } catch (IOException ioe) { LOG.debug("Caused by distributed cache output stream.", ioe); } }
From source file:org.mitre.ccv.mapred.CalculateCosineDistanceMatrix.java
License:Open Source License
/** * Outputs the distance matrix (DenseVectors) in Phylip Square format. Names/labels are limited to 10-characters! * * @param jobConf//from w w w. j a v a 2s. co m * @param input input directory name containing DenseVectors (as generated by this class). * @param output output file name * @param fractionDigits number of digits after decimal point * @throws IOException */ public static void printPhylipSquare(JobConf jobConf, String input, String output, int fractionDigits) throws IOException { JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class); DecimalFormat format = new DecimalFormat(); format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US)); format.setMinimumIntegerDigits(1); format.setMaximumFractionDigits(fractionDigits); //format.setMinimumFractionDigits(fractionDigits); format.setGroupingUsed(false); final Path inputPath = new Path(input); final FileSystem fs = inputPath.getFileSystem(conf); final Path qInputPath = fs.makeQualified(inputPath); final Path outputPath = new Path(output); Path[] paths = FileUtils.ls(conf, qInputPath.toString() + Path.SEPARATOR + "part-*"); FSDataOutputStream fos = fs.create(outputPath, true); // throws nothing! Writer writer = new OutputStreamWriter(fos); Text key = new Text(); DenseVectorWritable value = new DenseVectorWritable(); Boolean wroteHeader = false; for (int idx = 0; idx < paths.length; idx++) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf); boolean hasNext = reader.next(key, value); while (hasNext) { final DenseVector vector = value.get(); if (!wroteHeader) { writer.write(String.format("\t%d\n", vector.getCardinality())); wroteHeader = true; } final StringBuilder sb = new StringBuilder(); final String name = key.toString(); sb.append(name.substring(0, (name.length() > 10 ? 
10 : name.length()))); final int padding = Math.max(1, 10 - name.length()); for (int k = 0; k < padding; k++) { sb.append(' '); } sb.append(' '); for (int i = 0; i < vector.getCardinality(); i++) { final String s = format.format(vector.get(i)); // format the number sb.append(s); sb.append(' '); } sb.append("\n"); writer.write(sb.toString()); hasNext = reader.next(key, value); } try { writer.flush(); reader.close(); } catch (IOException ioe) { // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size LOG.debug("Probably caused by closing the SequenceFile.Reader. All is well", ioe); } } try { writer.close(); fos.flush(); fos.close(); } catch (IOException ioe) { LOG.debug("Caused by distributed cache output stream.", ioe); } }
From source file:org.mitre.ccv.mapred.CalculateCosineDistanceMatrix.java
License:Open Source License
public int initJob(JobConf jobConf, String input, String output) throws Exception { JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class); final Path inputPath = new Path(input); final FileSystem fs = inputPath.getFileSystem(conf); final Path qInputPath = fs.makeQualified(inputPath); /**//from www . j a v a2 s . c o m * Need to get all of the sample names/labels */ JobConf cacheConf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class); cacheConf.setJobName("CacheNorm2MapReduce"); cacheConf.setNumReduceTasks(1); // Want ONE part file // Set up IdentityMapper SequenceFileInputFormat.setInputPaths(cacheConf, new Path(input)); cacheConf.setInputFormat(SequenceFileInputFormat.class); cacheConf.setMapperClass(Norm2Mapper.class); cacheConf.setOutputKeyClass(StringDoublePairWritable.class); cacheConf.setOutputValueClass(SparseVectorWritable.class); // Set up IdentityReducer cacheConf.setReducerClass(IdentityReducer.class); cacheConf.setOutputFormat(SequenceFileOutputFormat.class); cacheConf.setNumReduceTasks(1); Path sfPath = FileUtils.createRemoteTempPath(fs, qInputPath.getParent()); LOG.info(String.format("Generating feature vector SequenceFile path %s", sfPath.toString())); SequenceFileOutputFormat.setOutputPath(cacheConf, sfPath); JobClient.runJob(cacheConf); Path cachePath = new Path(sfPath.toString() + Path.SEPARATOR + "part-00000"); // need to know the size (the reducer might be able to send this back via the Reporter, but how do we grab that info? StringDoublePairWritable key = new StringDoublePairWritable(); int size = 0; SequenceFile.Reader reader = new SequenceFile.Reader(fs, cachePath, conf); boolean hasNext = reader.next(key); while (hasNext) { size += 1; hasNext = reader.next(key); } try { reader.close(); } catch (IOException ioe) { // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size LOG.debug("Probably caused by closing the SequenceFile.Reader. 
All is well", ioe); } //LOG.info(String.format("Caching model file %s", qInputPath.toString())); URI listURI = new URI(fs.makeQualified(cachePath).toString()); DistributedCache.addCacheFile(listURI, conf); LOG.info(String.format("SequenceFile cache path %s (%s) with %d labels", listURI.toString(), cachePath.getName(), size)); conf.set(CACHE_PATH, cachePath.getName()); conf.setInt(DISTANCE_MATRIX_SIZE, size); /** * Main MapReduce Task of generating dot products */ LOG.info("Generating distances"); JobConf distanceConf = new JobConf(conf, CalculateCosineDistanceMatrix.class); distanceConf.setJobName("DistanceMapReduce"); // Set up distance mapper SequenceFileInputFormat.setInputPaths(distanceConf, new Path(input)); distanceConf.setInputFormat(SequenceFileInputFormat.class); distanceConf.setMapperClass(DistanceMap.class); distanceConf.setMapOutputKeyClass(Text.class); distanceConf.setMapOutputValueClass(SparseVectorWritable.class); // Set up reducer to merge lower-triangle results into a single dense distance vector distanceConf.setReducerClass(DistanceReducer.class); distanceConf.setOutputKeyClass(Text.class); distanceConf.setOutputValueClass(DenseVectorWritable.class); distanceConf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(distanceConf, new Path(output)); JobClient.runJob(distanceConf); return 0; }
From source file:org.mitre.ccv.mapred.CompleteCompositionVectors.java
License:Open Source License
/** * * The JSO data will be the same as {@link org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix}, but the features * will be in a different order. This version, by default sorts, only by entropy values, whereas the * ccv in-memory version sorts by the k-mer natural order (i.e., lexigraphic). * @param argv// w w w . j av a2 s . co m * @return * @throws java.lang.Exception */ @Override @SuppressWarnings("static-access") // For OptionBuilder public int run(String[] argv) throws Exception { JobConf conf = new JobConf(getConf()); String cli_title = "CompleteCompositionVectorHadoop"; int start = CalculateKmerCounts.DEFAULT_START; int end = CalculateKmerCounts.DEFAULT_END; int topkmers = 0; String input = null; String output = null; String vectorJsonOutput = null; //String kmerJsonOutput = null; boolean cleanLogs = false; /** create the Options */ Options options = new Options(); /** Hadoop Options */ options.addOption( OptionBuilder.withArgName("number").hasArg(true).withDescription("number of maps").create("m")); options.addOption( OptionBuilder.withArgName("number").hasArg(true).withDescription("number of reducers").create("r")); // org.hadoop.util.GenericOptionsParser should captures this, but it doesn't options.addOption(OptionBuilder.withArgName("property=value").hasArg(true).withValueSeparator() .withDescription("use value for given property").create("D")); /** CompleteCompositionVector Options */ options.addOption(OptionBuilder.withArgName("number").hasArg(true) .withDescription("number of top k-mers to use in calculations").create("topKmers")); options.addOption(OptionBuilder.withArgName("start").hasArg(true).withDescription("starting length of tile") .create("start")); options.addOption(OptionBuilder.withArgName("end").hasArg(true).withDescription("ending length of title") .create("end")); options.addOption(OptionBuilder.hasArg(true).withArgName("file") .withDescription("JSON file to write out k-mers to").create("kmersfile")); 
options.addOption(OptionBuilder.hasArg(true).withArgName("file") .withDescription("JSON file to write out feature vectors to " + "(Overrides kmersout, only one file will be written).") .create("vectorsfile")); options.addOption(OptionBuilder.withArgName("number").hasArg(true) .withDescription("What preference to use: 0-min 1-median 2-avg(min,med): default is median") .create("prefval")); options.addOption(OptionBuilder.withArgName("help").hasArg(false).withDescription("print this message") .create("help")); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); //GenericOptionsParser gop = new GenericOptionsParser(conf, options, argv); GenericOptionsParser gop = new GenericOptionsParser(conf, argv); String[] remaining_args = gop.getRemainingArgs(); // create the parser CommandLineParser parser = new GnuParser(); //CommandLine line = gop.getCommandLine(); String[] other_args = new String[] {}; try { CommandLine line = parser.parse(options, remaining_args); other_args = line.getArgs(); // Make sure there is a parameter left. 
if (other_args.length == 0) { System.out.println(cli_title); System.out.println("Missing input path!"); formatter.printHelp("hccv [options] <input> [<output>] ", options); GenericOptionsParser.printGenericCommandUsage(System.out); return -1; } Option[] opts = line.getOptions(); if (line.hasOption("help")) { System.out.println(cli_title); formatter.printHelp("hccv [options] <input> [<output>] ", options); GenericOptionsParser.printGenericCommandUsage(System.out); return -1; } // could also use line.iterator() for (Option opt : opts) { if (opt.getOpt().equals("m")) { conf.setNumMapTasks(Integer.parseInt(opt.getValue())); } if (opt.getOpt().equals("r")) { conf.setNumReduceTasks(Integer.parseInt(opt.getValue())); } if (opt.getOpt().equals("D")) { // We can have multiple properties we want to set String[] properties = opt.getValues(); for (String property : properties) { String[] keyval = property.split("="); conf.set(keyval[0], keyval[1]); } } if (opt.getOpt().equals("start")) { start = Integer.parseInt(opt.getValue()); } if (opt.getOpt().equals("end")) { end = Integer.parseInt(opt.getValue()); } if (opt.getOpt().equals("topKmers")) { topkmers = Integer.parseInt(opt.getValue()); } if (opt.getOpt().equals("vectorsfile")) { vectorJsonOutput = opt.getValue(); } } } catch (ParseException e) { LOG.warn("options parsing faild: " + e.getMessage()); System.out.println(cli_title); formatter.printHelp("hccv [options] <input> [<output>] ", options); GenericOptionsParser.printGenericCommandUsage(System.out); } if (start <= 2) { throw new IllegalArgumentException("Value of 'start' argument must be larger than 2"); } input = other_args[0]; if (other_args.length < 2) { output = input + "_" + FileUtils.getSimpleDate(); } else { output = other_args[2]; } /** * Check output path. 
Either needs to exist as a directory or not exist */ Path outputPath = new Path(output); FileSystem fs = outputPath.getFileSystem(conf); if (!fs.exists(outputPath)) { fs.mkdirs(outputPath); } else if (fs.exists(outputPath) || !fs.getFileStatus(outputPath).isDir()) { LOG.fatal(String.format("Output directory %s already exists", outputPath.makeQualified(fs))); throw new FileAlreadyExistsException( String.format("Output directory %s already exists", outputPath.makeQualified(fs))); } String outputDir = output + Path.SEPARATOR; int res; /** * Zero, CalculateCompositionVectors */ LOG.info("Starting CalculateCompositionVectors Map-Reduce job"); CalculateCompositionVectors cv = new CalculateCompositionVectors(); res = cv.initJob(conf, start, end, input, outputDir + COMPOSITION_VECTORS, cleanLogs); if (res != 0) { LOG.info("CalculateCompositionVectors returned non-zero result!"); return res; } // We can stop now or continue to reduce dimensionallity using RRE or other means /** * First, CalculateKmerCounts */ LOG.info("Starting CalculateKmerCounts Map-Reduce job"); // FastMap option for CalculateKmers!?! 
CalculateKmerCounts ckc = new CalculateKmerCounts(); res = ckc.initJob(conf, start, end, input, outputDir + KMER_COUNTS); if (res != 0) { LOG.fatal("CalculateKmerCounts returned non-zero result!"); return res; } /** * Second, TotalSequenceLength */ LOG.info("Starting TotalSequenceLength Map-Reduce job"); TotalSequenceLength tsl = new TotalSequenceLength(); res = tsl.initJob(conf, input, outputDir + TOTAL_LENGTH, cleanLogs); if (res != 0) { LOG.fatal("TotalSequenceLength returned non-zero result!"); return res; } int length = tsl.getCount(conf, outputDir + TOTAL_LENGTH); if (length < 3) { LOG.fatal("TotalSequenceLength returned a total sequence length of less than 3."); return -1; } else { LOG.info(String.format("TotalSequenceLength returned a total sequence length of %d.", length)); } /** * Third, CalculateKmerProbabilities */ LOG.info("Starting CalculateKmerProbabilities Map-Reduce job"); CalculateKmerProbabilities ckp = new CalculateKmerProbabilities(); res = ckp.initJob(conf, start, end, length, outputDir + KMER_COUNTS, outputDir + KMER_PROBABILITIES, cleanLogs); if (res != 0) { LOG.fatal("CalculateKmerProbabilities returned non-zero result!"); return res; } /** * Fourth, InvertKmerProbabilities */ LOG.info("Starting InvertKmerProbabilities Map-Reduce job"); InvertKmerProbabilities ikp = new InvertKmerProbabilities(); res = ikp.initJob(conf, outputDir + KMER_PROBABILITIES, outputDir + INVERTED_KMER_PROBABILITIES, cleanLogs); if (res != 0) { LOG.fatal("InvertKmerProbabilities returned non-zero result!"); return res; } /** * Fifth, CalculateKmerPiValues */ LOG.info("Starting CalculateKmerPiValues Map-Reduce job"); CalculateKmerPiValues kpv = new CalculateKmerPiValues(); res = kpv.initJob(conf, start, end, outputDir + INVERTED_KMER_PROBABILITIES, outputDir + KMER_PI_VALUES, cleanLogs); if (res != 0) { LOG.fatal("CalculateKmerPiValues returned non-zero result!"); return res; } /** * Sixth,CalculateKmerRevisedRelativeEntropy */ LOG.info("Starting 
CalculateKmerRevisedRelativeEntropy Map-Reduce job"); CalculateKmerRevisedRelativeEntropy krre = new CalculateKmerRevisedRelativeEntropy(); res = krre.initJob(conf, outputDir + KMER_PI_VALUES, outputDir + COMPOSITION_VECTORS, outputDir + ENTROPY_VALUES, cleanLogs); if (res != 0) { LOG.fatal("CalculateKmerRevisedRelativeEntropy returned non-zero result!"); return res; } /** * Seventh, SortKmerRevisedRelativeEntropies */ SortKmerRevisedRelativeEntropies srre = new SortKmerRevisedRelativeEntropies(); res = srre.initJob(conf, outputDir + ENTROPY_VALUES, outputDir + SORTED_ENTROPY_VALUES, cleanLogs); if (res != 0) { LOG.fatal("SortKmerRevisedRelativeEntropies returned non-zero result!"); return res; } /** * Eigth, GenerateFeatureVectors * * Generate a flatten list to add to the cache to be distributed to the map-tasks. */ Path listOutputPath = new Path(outputDir + Integer.toString(topkmers) + KMER_ENTROPY_SET); LOG.info(String.format("Loading %d sorted k-mers from %s to %s", topkmers, outputDir + SORTED_ENTROPY_VALUES, listOutputPath.toString())); int num = CompleteCompositionVectorUtils.flattenKmerEntropySequenceFile(conf, topkmers, outputDir + SORTED_ENTROPY_VALUES, listOutputPath.toString(), cleanLogs); if (num != topkmers) { LOG.fatal(String.format("Requested %d k-mers, but got %d. Using %d", topkmers, num, num)); topkmers = num; } GenerateFeatureVectors fv = new GenerateFeatureVectors(); res = fv.initJob(conf, listOutputPath.toString(), topkmers, outputDir + COMPOSITION_VECTORS, outputDir + FEATURE_VECTORS, cleanLogs); if (res != 0) { LOG.fatal("GenerateFeatureVectors returned non-zero result!"); return res; } /** * Save feature vectors, features (k-mers), and properties to a JSON file. * * The data will be the same as {@link org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix}, but the features * will be in a different order. 
This version, by default sorts, only by entropy values, whereas the * ccv in-memory version sorts by the k-mer natural order (i.e., lexigraphic). */ if (vectorJsonOutput != null && vectorJsonOutput.length() > 0) { LOG.info("Writing features out to " + vectorJsonOutput); CompleteCompositionVectorUtils.featureVectors2Json(conf, start, end, topkmers, outputDir + SORTED_ENTROPY_VALUES, outputDir + FEATURE_VECTORS, vectorJsonOutput); } LOG.info("All done generating complete composition vectors and feature vectors."); return res; }
From source file:org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java
License:Open Source License
/** * Returns the given number of k-mers from {@link SequenceFile}s containing {@link KmerEntropyPairWritable} as the keys. * * @param conf JobConf/*from w ww .ja v a 2s . c o m*/ * @param input path to SequenceFile * @param numKmers the number of k-mers to return (if null or 0, all will be returned). * @return {@link TreeSet} of sorted (see {@link KmerEntropyPairWritable} k-mers. * @throws java.io.IOException */ public static TreeSet<String> getKmerEntropiesFromSequenceFile(JobConf conf, String input, Integer length) throws IOException { TreeSet<String> nmers = new TreeSet<String>(); Path inputPath = new Path(input); FileSystem fs = inputPath.getFileSystem(conf); //Path inputPath = fs.makeQualified(path); Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*"); if (length == null || length <= 0) { length = Integer.MAX_VALUE; } int cnt = 0; for (int idx = 0; idx < paths.length; idx++) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf); KmerEntropyPairWritable key = new KmerEntropyPairWritable(); boolean hasNext = true; while (hasNext && cnt < length) { hasNext = reader.next(key); nmers.add(key.getKey()); cnt++; } } return nmers; }
From source file:org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java
License:Open Source License
/** * Flattens a {@link SequenceFile} containing {@link KmerEntropyPairWritable}s as keys to a file * containing only the keys as {@link KmerEntropyPairWritable} in the same order. * * @param conf//from w w w . java2 s . com * @param numKmers * @param input the input path containing the kmers. * @param output the output file path to write the keys to. * @param asText if <code>true</code>, then save keys and values as text. Otherwise, save as {@link Writable}s * @return the actual number written out. * @throws java.io.IOException */ public static synchronized int flattenKmerEntropySequenceFile(JobConf conf, int numKmers, String input, String output, boolean asText) throws IOException { if (LOG.isDebugEnabled()) { LOG.debug(String.format("Flattening %d k-mers entropies from %s to %s", numKmers, input, output)); } Path outPath = new Path(output); FileSystem fs = outPath.getFileSystem(conf); FSDataOutputStream fos = fs.create(outPath, true); // throws nothing! Path inputPath = new Path(input); Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*"); if (numKmers <= 0) { numKmers = Integer.MAX_VALUE; } int cnt = 0; for (int idx = 0; idx < paths.length; idx++) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf); KmerEntropyPairWritable key = new KmerEntropyPairWritable(); boolean hasNext = true; while (hasNext && cnt < numKmers) { hasNext = reader.next(key); if (asText) { fos.writeUTF(key.toString()); } else { key.write(fos); } cnt++; } try { fos.close(); reader.close(); } catch (IOException ioe) { // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size LOG.debug("Probably caused by closing the SequenceFile.Reader", ioe); } } return cnt; }
From source file:org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java
License:Open Source License
/** * Writes out the {@link SequenceFile} feature vectors in row major (packed) order. No labels are outputed. * * @param jobConf//from w ww . ja v a 2 s . c o m * @param input top level SequenceFile directory path * @param output path to output the matrix * @param digits the maximum number of fraction digits * @throws IOException */ public static void featureVectors2RowMajorMatrix(JobConf jobConf, String input, String output, int digits) throws IOException { JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class); DecimalFormat format = new DecimalFormat(); format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US)); format.setMinimumIntegerDigits(1); format.setMaximumFractionDigits(digits); //format.setMinimumFractionDigits(fractionDigits); format.setGroupingUsed(false); final Path inputPath = new Path(input); final FileSystem fs = inputPath.getFileSystem(conf); final Path qInputPath = fs.makeQualified(inputPath); final Path outputPath = new Path(output); Path[] paths = FileUtils.ls(conf, qInputPath.toString() + Path.SEPARATOR + "part-*"); FSDataOutputStream fos = fs.create(outputPath, true); // throws nothing! 
final Writer writer = new OutputStreamWriter(fos); final Text key = new Text(); final SparseVectorWritable value = new SparseVectorWritable(); for (int idx = 0; idx < paths.length; idx++) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf); boolean hasNext = reader.next(key, value); while (hasNext) { final SparseVector vector = value.get(); final StringBuilder sb = new StringBuilder(); for (int i = 0; i < vector.getCardinality(); i++) { final String s = format.format(vector.get(i)); // format the number sb.append(s); sb.append(' '); } writer.write(sb.toString()); hasNext = reader.next(key, value); } try { writer.flush(); reader.close(); } catch (IOException ioe) { // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size LOG.debug("Probably caused by closing the SequenceFile.Reader. All is well", ioe); } } try { writer.close(); fos.flush(); fos.close(); } catch (IOException ioe) { LOG.debug("Caused by distributed cache output stream.", ioe); } }
From source file:org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java
License:Open Source License
/** * Flattens a {@link SequenceFile} containing {@link KmerEntropyPairWritable}s as keys to a json file * containing the k-mers (<tt>features</tt>) in the same order, along with the start and end window sizes. * * @param conf/* ww w. j a v a2 s .c o m*/ * @param numKmers the number of k-mers to return (if 0 or less, all will be returned). * @param input the input path containing the kmers. * @param output the output file path to write the json file to. * @return the actual number of kmers written out * @throws java.io.IOException */ public static int kmerSequenceFile2Json(JobConf conf, int start, int end, int numKmers, String input, String output) throws IOException { Path outPath = new Path(output); FileSystem fs = outPath.getFileSystem(conf); FSDataOutputStream fos = fs.create(outPath, true); // throws nothing! Path inputPath = new Path(input); Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*"); if (numKmers <= 0) { numKmers = Integer.MAX_VALUE; } int cnt = 0; Writer writer = new OutputStreamWriter(fos); JsonFactory jf = new JsonFactory(); JsonGenerator jg = jf.createJsonGenerator(writer); CompleteCompositionVectorUtils util = new CompleteCompositionVectorUtils(); try { jg.writeStartObject(); util.writeJsonCcvProperties(jg, start, end); cnt = util.writeJsonKmers(conf, fs, paths, jg, numKmers); jg.writeEndObject(); jg.close(); writer.close(); } catch (JsonGenerationException ex) { LOG.error("Unable to write the nmers to a json object", ex); } return cnt; }
From source file:org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java
License:Open Source License
/**
 * Write out feature vectors, features (k-mers), and properties (start, end) to a JSON file.
 * <P>JSON format
 * <blockquote><pre>
 * {
 *   "properties" :
 *   {
 *     "begin" : 3
 *     "end" : 9
 *   }
 *   "features" : [..]
 *   "samples" :
 *   [
 *     {
 *       "name" : "sample name",
 *       "data" : { nmer_index: non-zero pi-values }
 *     }, ....
 *   ]
 * }
 * </pre></blockquote>
 *
 * The data will be the same as {@link org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix}, but the features
 * will be in a different order. The mapred version, by default, sorts only by entropy value, whereas the
 * ccv in-memory version sorts by the k-mer natural order (lexicographic).
 *
 * <p>A {@link JsonGenerationException} is logged rather than rethrown; in that case the output file
 * may be incomplete. The output is closed on every path.
 *
 * @see org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix
 *
 * @param conf the job configuration
 * @param start beginning window size
 * @param end ending window size
 * @param numKmers the number of k-mers to return (if 0 or less, all will be returned).
 * @param listInput {@link SequenceFile} path containing k-mers used to generate the feature vectors.
 * @param featureInput {@link SequenceFile} path containing feature vectors {@link SparseVectorWritable}.
 * @param output the output file path to write the json file to; an existing file is overwritten
 * @return the actual number of kmers written out (not samples/feature vectors), or 0 if JSON
 *     generation failed before the k-mers were written
 * @throws java.io.IOException if the output cannot be created, written or closed, or an input
 *     cannot be listed or read
 */
public static int featureVectors2Json(JobConf conf, int start, int end, int numKmers, String listInput,
        String featureInput, String output) throws IOException {
    Path outPath = new Path(output);
    FileSystem fs = outPath.getFileSystem(conf);
    FSDataOutputStream fos = fs.create(outPath, true);
    Writer writer = new OutputStreamWriter(fos);

    final int limit = (numKmers <= 0) ? Integer.MAX_VALUE : numKmers;

    int cnt = 0;
    try {
        JsonFactory jf = new JsonFactory();
        JsonGenerator jg = jf.createJsonGenerator(writer);
        CompleteCompositionVectorUtils util = new CompleteCompositionVectorUtils();
        try {
            jg.writeStartObject();
            util.writeJsonCcvProperties(jg, start, end);

            /** Get k-mers (features) */
            Path inputPath = new Path(listInput);
            Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*");
            cnt = util.writeJsonKmers(conf, fs, paths, jg, limit);

            /** Get samples */
            inputPath = new Path(featureInput);
            paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*");
            util.jsonCcvVectors(conf, fs, paths, jg);

            jg.writeEndObject();
            // Closing the generator flushes its buffered content into the writer.
            jg.close();
        } catch (JsonGenerationException ex) {
            LOG.error("Unable to write the nmers to a json object", ex);
        }
    } finally {
        // Release the output on every path, including failures while listing or reading input.
        writer.close();
    }
    return cnt;
}
From source file:org.mitre.ccv.mapred.SortKmerRevisedRelativeEntropies.java
License:Open Source License
/** * Returns the given number of k-mers from {@link SequenceFile}s generated by this class. * * @param conf/* w w w .j a va 2s . c o m*/ * @param input the path containing the <code>SequenceFile</code> parts. * @param m the number of k-mers to return. If <= 0, then {@link Integer.MAX_VALUE} is returned. * @return * @throws java.io.IOException */ static TreeSet<String> getkmers(JobConf conf, String input, Integer m) throws IOException { TreeSet<String> nmers = new TreeSet<String>(); Path inputPath = new Path(input); FileSystem fs = inputPath.getFileSystem(conf); //Path inputPath = fs.makeQualified(path); Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*"); if (m <= 0) { m = Integer.MAX_VALUE; } int cnt = 0; for (int idx = 0; idx < paths.length; idx++) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf); KmerEntropyPairWritable key = new KmerEntropyPairWritable(); boolean hasNext = true; while (hasNext && cnt < m) { hasNext = reader.next(key); nmers.add(key.getKey()); cnt++; } } return nmers; }