Example usage for org.apache.hadoop.fs Path SEPARATOR

List of usage examples for org.apache.hadoop.fs Path SEPARATOR

Introduction

On this page you can find example usage for org.apache.hadoop.fs Path SEPARATOR.

Prototype

String SEPARATOR

Document

The directory separator, a slash.
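
For quick orientation, here is a minimal, self-contained sketch (not taken from the sources on this page; the "job-output" directory name is a placeholder) showing the typical pattern: joining a directory and a glob with Path.SEPARATOR to list a MapReduce job's part-* files, just as the methods below do.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathSeparatorExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path outputDir = new Path("job-output"); // placeholder directory
        FileSystem fs = outputDir.getFileSystem(conf);

        // Path.SEPARATOR is the portable directory separator ("/"),
        // so the glob is built the same way on every platform.
        Path pattern = new Path(outputDir.toString() + Path.SEPARATOR + "part-*");
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath());
            }
        }
    }
}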

Usage

From source file:org.mitre.ccv.mapred.CalculateCosineDistanceMatrix.java

License:Open Source License

/**
 * Writes out the matrix in row major (packed) order. No labels are output.
 *
 * @param jobConf
 * @param input
 * @param output
 * @param digits
 * @throws IOException
 */
public static void printRowMajorMatrix(JobConf jobConf, String input, String output, int digits)
        throws IOException {
    JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class);

    DecimalFormat format = new DecimalFormat();
    format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US));
    format.setMinimumIntegerDigits(1);
    format.setMaximumFractionDigits(digits);
    //format.setMinimumFractionDigits(fractionDigits);
    format.setGroupingUsed(false);

    final Path inputPath = new Path(input);
    final FileSystem fs = inputPath.getFileSystem(conf);
    final Path qInputPath = fs.makeQualified(inputPath);
    final Path outputPath = new Path(output);
    Path[] paths = FileUtils.ls(conf, qInputPath.toString() + Path.SEPARATOR + "part-*");

    FSDataOutputStream fos = fs.create(outputPath, true); // throws nothing!
    final Writer writer = new OutputStreamWriter(fos);
    final Text key = new Text();
    final DenseVectorWritable value = new DenseVectorWritable();
    for (int idx = 0; idx < paths.length; idx++) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf);
        boolean hasNext = reader.next(key, value);
        while (hasNext) {

            final DenseVector vector = value.get();
            final StringBuilder sb = new StringBuilder();
            for (int i = 0; i < vector.getCardinality(); i++) {
                final String s = format.format(vector.get(i)); // format the number
                sb.append(s);
                sb.append(' ');
            }
            writer.write(sb.toString());
            hasNext = reader.next(key, value);
        }
        try {
            writer.flush();
            reader.close();
        } catch (IOException ioe) {
            // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size
            LOG.debug("Probably caused by closing the SequenceFile.Reader. All is well", ioe);
        }
    }
    try {
        writer.close();
        fos.flush();
        fos.close();
    } catch (IOException ioe) {
        LOG.debug("Caused by distributed cache output stream.", ioe);
    }
}

From source file:org.mitre.ccv.mapred.CalculateCosineDistanceMatrix.java

License:Open Source License

/**
 * Outputs the distance matrix (DenseVectors) in Phylip square format. Names/labels are limited to 10 characters.
 *
 * @param jobConf
 * @param input             input directory name containing DenseVectors (as generated by this class).
 * @param output            output file name
 * @param fractionDigits    number of digits after decimal point
 * @throws IOException
 */
public static void printPhylipSquare(JobConf jobConf, String input, String output, int fractionDigits)
        throws IOException {
    JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class);

    DecimalFormat format = new DecimalFormat();
    format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US));
    format.setMinimumIntegerDigits(1);
    format.setMaximumFractionDigits(fractionDigits);
    //format.setMinimumFractionDigits(fractionDigits);
    format.setGroupingUsed(false);

    final Path inputPath = new Path(input);
    final FileSystem fs = inputPath.getFileSystem(conf);
    final Path qInputPath = fs.makeQualified(inputPath);
    final Path outputPath = new Path(output);
    Path[] paths = FileUtils.ls(conf, qInputPath.toString() + Path.SEPARATOR + "part-*");

    FSDataOutputStream fos = fs.create(outputPath, true); // throws nothing!
    Writer writer = new OutputStreamWriter(fos);
    Text key = new Text();
    DenseVectorWritable value = new DenseVectorWritable();
    Boolean wroteHeader = false;
    for (int idx = 0; idx < paths.length; idx++) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf);
        boolean hasNext = reader.next(key, value);
        while (hasNext) {

            final DenseVector vector = value.get();
            if (!wroteHeader) {
                writer.write(String.format("\t%d\n", vector.getCardinality()));
                wroteHeader = true;
            }

            final StringBuilder sb = new StringBuilder();
            final String name = key.toString();
            sb.append(name.substring(0, (name.length() > 10 ? 10 : name.length())));
            final int padding = Math.max(1, 10 - name.length());
            for (int k = 0; k < padding; k++) {
                sb.append(' ');
            }
            sb.append(' ');
            for (int i = 0; i < vector.getCardinality(); i++) {
                final String s = format.format(vector.get(i)); // format the number
                sb.append(s);
                sb.append(' ');
            }
            sb.append("\n");
            writer.write(sb.toString());
            hasNext = reader.next(key, value);
        }
        try {
            writer.flush();
            reader.close();
        } catch (IOException ioe) {
            // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size
            LOG.debug("Probably caused by closing the SequenceFile.Reader. All is well", ioe);
        }
    }
    try {
        writer.close();
        fos.flush();
        fos.close();
    } catch (IOException ioe) {
        LOG.debug("Caused by distributed cache output stream.", ioe);
    }
}
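
A short, hypothetical driver for the method above (class name and paths are placeholders, and the digits argument of 6 is arbitrary), with a comment sketching the general shape of the Phylip square file it writes.

import org.apache.hadoop.mapred.JobConf;
import org.mitre.ccv.mapred.CalculateCosineDistanceMatrix;

public class PhylipExportExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(PhylipExportExample.class);
        // Placeholder paths: the input directory holds the DenseVector SequenceFiles
        // produced by CalculateCosineDistanceMatrix; the output is a single text file.
        CalculateCosineDistanceMatrix.printPhylipSquare(conf, "ccv/distances", "ccv/distances.phylip", 6);
        // Resulting Phylip square layout (values illustrative only):
        //     <number of taxa>
        //     seqA       0 0.123456 0.234567 ...
        //     seqB       0.123456 0 0.345678 ...
        // Each name is truncated or padded to 10 characters by printPhylipSquare.
    }
}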

From source file:org.mitre.ccv.mapred.CalculateCosineDistanceMatrix.java

License:Open Source License

public int initJob(JobConf jobConf, String input, String output) throws Exception {
    JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class);

    final Path inputPath = new Path(input);
    final FileSystem fs = inputPath.getFileSystem(conf);
    final Path qInputPath = fs.makeQualified(inputPath);

    /**
     * Need to get all of the sample names/labels
     */
    JobConf cacheConf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class);
    cacheConf.setJobName("CacheNorm2MapReduce");
    cacheConf.setNumReduceTasks(1); // Want ONE part file

    // Set up IdentityMapper
    SequenceFileInputFormat.setInputPaths(cacheConf, new Path(input));
    cacheConf.setInputFormat(SequenceFileInputFormat.class);
    cacheConf.setMapperClass(Norm2Mapper.class);
    cacheConf.setOutputKeyClass(StringDoublePairWritable.class);
    cacheConf.setOutputValueClass(SparseVectorWritable.class);

    // Set up IdentityReducer
    cacheConf.setReducerClass(IdentityReducer.class);
    cacheConf.setOutputFormat(SequenceFileOutputFormat.class);
    cacheConf.setNumReduceTasks(1);
    Path sfPath = FileUtils.createRemoteTempPath(fs, qInputPath.getParent());
    LOG.info(String.format("Generating feature vector SequenceFile path %s", sfPath.toString()));
    SequenceFileOutputFormat.setOutputPath(cacheConf, sfPath);
    JobClient.runJob(cacheConf);

    Path cachePath = new Path(sfPath.toString() + Path.SEPARATOR + "part-00000");

    // need to know the size (the reducer might be able to send this back via the Reporter, but how do we grab that info?)
    StringDoublePairWritable key = new StringDoublePairWritable();
    int size = 0;
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, cachePath, conf);
    boolean hasNext = reader.next(key);
    while (hasNext) {
        size += 1;
        hasNext = reader.next(key);
    }
    try {
        reader.close();
    } catch (IOException ioe) {
        // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size
        LOG.debug("Probably caused by closing the SequenceFile.Reader. All is well", ioe);
    }

    //LOG.info(String.format("Caching model file %s", qInputPath.toString()));
    URI listURI = new URI(fs.makeQualified(cachePath).toString());
    DistributedCache.addCacheFile(listURI, conf);
    LOG.info(String.format("SequenceFile cache path %s (%s) with %d labels", listURI.toString(),
            cachePath.getName(), size));
    conf.set(CACHE_PATH, cachePath.getName());
    conf.setInt(DISTANCE_MATRIX_SIZE, size);

    /**
     * Main MapReduce Task of generating dot products
     */
    LOG.info("Generating distances");
    JobConf distanceConf = new JobConf(conf, CalculateCosineDistanceMatrix.class);
    distanceConf.setJobName("DistanceMapReduce");
    // Set up distance mapper
    SequenceFileInputFormat.setInputPaths(distanceConf, new Path(input));
    distanceConf.setInputFormat(SequenceFileInputFormat.class);
    distanceConf.setMapperClass(DistanceMap.class);
    distanceConf.setMapOutputKeyClass(Text.class);
    distanceConf.setMapOutputValueClass(SparseVectorWritable.class);

    // Set up reducer to merge lower-triangle results into a single dense distance vector
    distanceConf.setReducerClass(DistanceReducer.class);
    distanceConf.setOutputKeyClass(Text.class);
    distanceConf.setOutputValueClass(DenseVectorWritable.class);
    distanceConf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(distanceConf, new Path(output));
    JobClient.runJob(distanceConf);

    return 0;
}

From source file:org.mitre.ccv.mapred.CompleteCompositionVectors.java

License:Open Source License

/**
 *
 * The JSON data will be the same as {@link org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix}, but the features
 * will be in a different order. This version, by default, sorts only by entropy values, whereas the
 * ccv in-memory version sorts by the k-mer natural order (i.e., lexicographic).
 * @param argv
 * @return
 * @throws java.lang.Exception
 */
@Override
@SuppressWarnings("static-access") // For OptionBuilder
public int run(String[] argv) throws Exception {
    JobConf conf = new JobConf(getConf());
    String cli_title = "CompleteCompositionVectorHadoop";

    int start = CalculateKmerCounts.DEFAULT_START;
    int end = CalculateKmerCounts.DEFAULT_END;
    int topkmers = 0;

    String input = null;
    String output = null;
    String vectorJsonOutput = null;
    //String kmerJsonOutput = null;

    boolean cleanLogs = false;

    /** create the Options */
    Options options = new Options();

    /** Hadoop Options */
    options.addOption(
            OptionBuilder.withArgName("number").hasArg(true).withDescription("number of maps").create("m"));
    options.addOption(
            OptionBuilder.withArgName("number").hasArg(true).withDescription("number of reducers").create("r"));

    // org.apache.hadoop.util.GenericOptionsParser should capture this, but it doesn't
    options.addOption(OptionBuilder.withArgName("property=value").hasArg(true).withValueSeparator()
            .withDescription("use value for given property").create("D"));

    /** CompleteCompositionVector Options */
    options.addOption(OptionBuilder.withArgName("number").hasArg(true)
            .withDescription("number of top k-mers to use in calculations").create("topKmers"));
    options.addOption(OptionBuilder.withArgName("start").hasArg(true).withDescription("starting length of tile")
            .create("start"));
    options.addOption(OptionBuilder.withArgName("end").hasArg(true).withDescription("ending length of title")
            .create("end"));
    options.addOption(OptionBuilder.hasArg(true).withArgName("file")
            .withDescription("JSON file to write out k-mers to").create("kmersfile"));

    options.addOption(OptionBuilder.hasArg(true).withArgName("file")
            .withDescription("JSON file to write out feature vectors to "
                    + "(Overrides kmersout, only one file will be written).")
            .create("vectorsfile"));

    options.addOption(OptionBuilder.withArgName("number").hasArg(true)
            .withDescription("What preference to use: 0-min 1-median 2-avg(min,med): default is median")
            .create("prefval"));

    options.addOption(OptionBuilder.withArgName("help").hasArg(false).withDescription("print this message")
            .create("help"));

    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();

    //GenericOptionsParser gop = new GenericOptionsParser(conf, options, argv);
    GenericOptionsParser gop = new GenericOptionsParser(conf, argv);

    String[] remaining_args = gop.getRemainingArgs();

    // create the parser
    CommandLineParser parser = new GnuParser();
    //CommandLine line = gop.getCommandLine();
    String[] other_args = new String[] {};

    try {
        CommandLine line = parser.parse(options, remaining_args);
        other_args = line.getArgs();

        // Make sure there is a parameter left.
        if (other_args.length == 0) {
            System.out.println(cli_title);
            System.out.println("Missing input path!");
            formatter.printHelp("hccv [options] <input> [<output>] ", options);
            GenericOptionsParser.printGenericCommandUsage(System.out);
            return -1;
        }

        Option[] opts = line.getOptions();
        if (line.hasOption("help")) {
            System.out.println(cli_title);
            formatter.printHelp("hccv [options] <input> [<output>] ", options);
            GenericOptionsParser.printGenericCommandUsage(System.out);
            return -1;
        }

        // could also use line.iterator()
        for (Option opt : opts) {
            if (opt.getOpt().equals("m")) {
                conf.setNumMapTasks(Integer.parseInt(opt.getValue()));
            }
            if (opt.getOpt().equals("r")) {
                conf.setNumReduceTasks(Integer.parseInt(opt.getValue()));
            }
            if (opt.getOpt().equals("D")) {
                // We can have multiple properties we want to set
                String[] properties = opt.getValues();
                for (String property : properties) {
                    String[] keyval = property.split("=");
                    conf.set(keyval[0], keyval[1]);
                }
            }
            if (opt.getOpt().equals("start")) {
                start = Integer.parseInt(opt.getValue());
            }
            if (opt.getOpt().equals("end")) {
                end = Integer.parseInt(opt.getValue());
            }
            if (opt.getOpt().equals("topKmers")) {
                topkmers = Integer.parseInt(opt.getValue());
            }
            if (opt.getOpt().equals("vectorsfile")) {
                vectorJsonOutput = opt.getValue();
            }
        }
    } catch (ParseException e) {
        LOG.warn("options parsing faild: " + e.getMessage());
        System.out.println(cli_title);
        formatter.printHelp("hccv [options] <input> [<output>] ", options);
        GenericOptionsParser.printGenericCommandUsage(System.out);
    }
    if (start <= 2) {
        throw new IllegalArgumentException("Value of 'start' argument must be larger than 2");
    }

    input = other_args[0];
    if (other_args.length < 2) {
        output = input + "_" + FileUtils.getSimpleDate();
    } else {
        output = other_args[1];
    }

    /**
     * Check output path. Either needs to exist as a directory or not exist
     */
    Path outputPath = new Path(output);
    FileSystem fs = outputPath.getFileSystem(conf);
    if (!fs.exists(outputPath)) {
        fs.mkdirs(outputPath);
    } else if (!fs.getFileStatus(outputPath).isDir()) {
        LOG.fatal(String.format("Output directory %s already exists", outputPath.makeQualified(fs)));
        throw new FileAlreadyExistsException(
                String.format("Output directory %s already exists", outputPath.makeQualified(fs)));
    }

    String outputDir = output + Path.SEPARATOR;

    int res;
    /**
     * Zero, CalculateCompositionVectors
     */
    LOG.info("Starting CalculateCompositionVectors Map-Reduce job");
    CalculateCompositionVectors cv = new CalculateCompositionVectors();
    res = cv.initJob(conf, start, end, input, outputDir + COMPOSITION_VECTORS, cleanLogs);
    if (res != 0) {
        LOG.info("CalculateCompositionVectors returned non-zero result!");
        return res;
    }
    // We can stop now or continue to reduce dimensionality using RRE or other means

    /**
     * First, CalculateKmerCounts
     */
    LOG.info("Starting CalculateKmerCounts Map-Reduce job");
    // FastMap option for CalculateKmers!?!
    CalculateKmerCounts ckc = new CalculateKmerCounts();
    res = ckc.initJob(conf, start, end, input, outputDir + KMER_COUNTS);
    if (res != 0) {
        LOG.fatal("CalculateKmerCounts returned non-zero result!");
        return res;
    }

    /**
     * Second, TotalSequenceLength
     */
    LOG.info("Starting TotalSequenceLength Map-Reduce job");
    TotalSequenceLength tsl = new TotalSequenceLength();
    res = tsl.initJob(conf, input, outputDir + TOTAL_LENGTH, cleanLogs);
    if (res != 0) {
        LOG.fatal("TotalSequenceLength returned non-zero result!");
        return res;
    }
    int length = tsl.getCount(conf, outputDir + TOTAL_LENGTH);

    if (length < 3) {
        LOG.fatal("TotalSequenceLength returned a total sequence length of less than 3.");
        return -1;
    } else {
        LOG.info(String.format("TotalSequenceLength returned a total sequence length of %d.", length));
    }

    /**
     * Third, CalculateKmerProbabilities
     */
    LOG.info("Starting CalculateKmerProbabilities Map-Reduce job");
    CalculateKmerProbabilities ckp = new CalculateKmerProbabilities();
    res = ckp.initJob(conf, start, end, length, outputDir + KMER_COUNTS, outputDir + KMER_PROBABILITIES,
            cleanLogs);
    if (res != 0) {
        LOG.fatal("CalculateKmerProbabilities returned non-zero result!");
        return res;
    }

    /**
     * Fourth, InvertKmerProbabilities
     */
    LOG.info("Starting InvertKmerProbabilities Map-Reduce job");
    InvertKmerProbabilities ikp = new InvertKmerProbabilities();
    res = ikp.initJob(conf, outputDir + KMER_PROBABILITIES, outputDir + INVERTED_KMER_PROBABILITIES, cleanLogs);
    if (res != 0) {
        LOG.fatal("InvertKmerProbabilities returned non-zero result!");
        return res;
    }

    /**
     * Fifth, CalculateKmerPiValues
     */
    LOG.info("Starting CalculateKmerPiValues Map-Reduce job");
    CalculateKmerPiValues kpv = new CalculateKmerPiValues();
    res = kpv.initJob(conf, start, end, outputDir + INVERTED_KMER_PROBABILITIES, outputDir + KMER_PI_VALUES,
            cleanLogs);
    if (res != 0) {
        LOG.fatal("CalculateKmerPiValues returned non-zero result!");
        return res;
    }

    /**
     * Sixth,CalculateKmerRevisedRelativeEntropy
     */
    LOG.info("Starting CalculateKmerRevisedRelativeEntropy Map-Reduce job");
    CalculateKmerRevisedRelativeEntropy krre = new CalculateKmerRevisedRelativeEntropy();
    res = krre.initJob(conf, outputDir + KMER_PI_VALUES, outputDir + COMPOSITION_VECTORS,
            outputDir + ENTROPY_VALUES, cleanLogs);
    if (res != 0) {
        LOG.fatal("CalculateKmerRevisedRelativeEntropy returned non-zero result!");
        return res;
    }

    /**
     * Seventh, SortKmerRevisedRelativeEntropies
     */
    SortKmerRevisedRelativeEntropies srre = new SortKmerRevisedRelativeEntropies();
    res = srre.initJob(conf, outputDir + ENTROPY_VALUES, outputDir + SORTED_ENTROPY_VALUES, cleanLogs);
    if (res != 0) {
        LOG.fatal("SortKmerRevisedRelativeEntropies returned non-zero result!");
        return res;
    }

    /**
     * Eighth, GenerateFeatureVectors
     *
     * Generate a flattened list to add to the cache to be distributed to the map-tasks.
     */
    Path listOutputPath = new Path(outputDir + Integer.toString(topkmers) + KMER_ENTROPY_SET);
    LOG.info(String.format("Loading %d sorted k-mers from %s to %s", topkmers,
            outputDir + SORTED_ENTROPY_VALUES, listOutputPath.toString()));
    int num = CompleteCompositionVectorUtils.flattenKmerEntropySequenceFile(conf, topkmers,
            outputDir + SORTED_ENTROPY_VALUES, listOutputPath.toString(), cleanLogs);

    if (num != topkmers) {
        LOG.fatal(String.format("Requested %d k-mers, but got %d. Using %d", topkmers, num, num));
        topkmers = num;
    }
    GenerateFeatureVectors fv = new GenerateFeatureVectors();
    res = fv.initJob(conf, listOutputPath.toString(), topkmers, outputDir + COMPOSITION_VECTORS,
            outputDir + FEATURE_VECTORS, cleanLogs);
    if (res != 0) {
        LOG.fatal("GenerateFeatureVectors returned non-zero result!");
        return res;
    }

    /**
     * Save feature vectors, features (k-mers), and properties to a JSON file.
     *
     * The data will be the same as {@link org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix}, but the features
     * will be in a different order. This version, by default, sorts only by entropy values, whereas the
     * ccv in-memory version sorts by the k-mer natural order (i.e., lexicographic).
     */
    if (vectorJsonOutput != null && vectorJsonOutput.length() > 0) {
        LOG.info("Writing features out to " + vectorJsonOutput);
        CompleteCompositionVectorUtils.featureVectors2Json(conf, start, end, topkmers,
                outputDir + SORTED_ENTROPY_VALUES, outputDir + FEATURE_VECTORS, vectorJsonOutput);
    }

    LOG.info("All done generating complete composition vectors and feature vectors.");
    return res;
}

From source file:org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java

License:Open Source License

/**
 * Returns the given number of k-mers from {@link SequenceFile}s containing {@link KmerEntropyPairWritable} as the keys.
 *
 * @param conf      JobConf
 * @param input     path to SequenceFile
 * @param length    the number of k-mers to return (if null or 0, all will be returned).
 * @return          {@link TreeSet} of sorted k-mers (see {@link KmerEntropyPairWritable}).
 * @throws java.io.IOException
 */
public static TreeSet<String> getKmerEntropiesFromSequenceFile(JobConf conf, String input, Integer length)
        throws IOException {
    TreeSet<String> nmers = new TreeSet<String>();
    Path inputPath = new Path(input);
    FileSystem fs = inputPath.getFileSystem(conf);
    //Path inputPath = fs.makeQualified(path);
    Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*");
    if (length == null || length <= 0) {
        length = Integer.MAX_VALUE;
    }
    int cnt = 0;
    for (int idx = 0; idx < paths.length; idx++) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf);
        KmerEntropyPairWritable key = new KmerEntropyPairWritable();
        boolean hasNext = true;
        while (hasNext && cnt < length) {
            hasNext = reader.next(key);
            if (hasNext) { // only add keys that were actually read
                nmers.add(key.getKey());
                cnt++;
            }
        }
        reader.close();
    }
    return nmers;
}

From source file:org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java

License:Open Source License

/**
 * Flattens a {@link SequenceFile} containing {@link KmerEntropyPairWritable}s as keys to a file
 * containing only the keys as {@link KmerEntropyPairWritable} in the same order.
 *
 * @param conf
 * @param numKmers
 * @param input     the input path containing the kmers.
 * @param output    the output file path to write the keys to.
 * @param asText    if <code>true</code>, then save keys and values as text. Otherwise, save as {@link Writable}s
 * @return          the actual number written out.
 * @throws java.io.IOException
 */
public static synchronized int flattenKmerEntropySequenceFile(JobConf conf, int numKmers, String input,
        String output, boolean asText) throws IOException {
    if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Flattening %d k-mers entropies from %s to %s", numKmers, input, output));
    }
    Path outPath = new Path(output);
    FileSystem fs = outPath.getFileSystem(conf);

    FSDataOutputStream fos = fs.create(outPath, true); // throws nothing!
    Path inputPath = new Path(input);
    Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*");
    if (numKmers <= 0) {
        numKmers = Integer.MAX_VALUE;
    }
    int cnt = 0;
    for (int idx = 0; idx < paths.length; idx++) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf);
        KmerEntropyPairWritable key = new KmerEntropyPairWritable();
        boolean hasNext = true;
        while (hasNext && cnt < numKmers) {
            hasNext = reader.next(key);
            if (!hasNext) { // nothing left to read in this part file
                break;
            }
            if (asText) {
                fos.writeUTF(key.toString());
            } else {
                key.write(fos);
            }
            cnt++;
        }
        try {
            reader.close();
        } catch (IOException ioe) {
            // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size
            LOG.debug("Probably caused by closing the SequenceFile.Reader", ioe);
        }
    }
    // Close the flattened output only after all part files have been read
    fos.close();
    return cnt;
}

From source file:org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java

License:Open Source License

/**
 * Writes out the {@link SequenceFile} feature vectors in row major (packed) order. No labels are output.
 *
 * @param jobConf
 * @param input     top level SequenceFile directory path
 * @param output    path to output the matrix
 * @param digits    the maximum number of fraction digits
 * @throws IOException
 */
public static void featureVectors2RowMajorMatrix(JobConf jobConf, String input, String output, int digits)
        throws IOException {
    JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class);

    DecimalFormat format = new DecimalFormat();
    format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US));
    format.setMinimumIntegerDigits(1);
    format.setMaximumFractionDigits(digits);
    //format.setMinimumFractionDigits(fractionDigits);
    format.setGroupingUsed(false);

    final Path inputPath = new Path(input);
    final FileSystem fs = inputPath.getFileSystem(conf);
    final Path qInputPath = fs.makeQualified(inputPath);
    final Path outputPath = new Path(output);
    Path[] paths = FileUtils.ls(conf, qInputPath.toString() + Path.SEPARATOR + "part-*");

    FSDataOutputStream fos = fs.create(outputPath, true); // throws nothing!
    final Writer writer = new OutputStreamWriter(fos);
    final Text key = new Text();
    final SparseVectorWritable value = new SparseVectorWritable();
    for (int idx = 0; idx < paths.length; idx++) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf);
        boolean hasNext = reader.next(key, value);
        while (hasNext) {

            final SparseVector vector = value.get();
            final StringBuilder sb = new StringBuilder();
            for (int i = 0; i < vector.getCardinality(); i++) {
                final String s = format.format(vector.get(i)); // format the number
                sb.append(s);
                sb.append(' ');
            }
            writer.write(sb.toString());
            hasNext = reader.next(key, value);
        }
        try {
            writer.flush();
            reader.close();
        } catch (IOException ioe) {
            // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size
            LOG.debug("Probably caused by closing the SequenceFile.Reader. All is well", ioe);
        }
    }
    try {
        writer.close();
        fos.flush();
        fos.close();
    } catch (IOException ioe) {
        LOG.debug("Caused by distributed cache output stream.", ioe);
    }
}

From source file:org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java

License:Open Source License

/**
 * Flattens a {@link SequenceFile} containing {@link KmerEntropyPairWritable}s as keys to a json file
 * containing the k-mers (<tt>features</tt>) in the same order, along with the start and end window sizes.
 *
 * @param conf
 * @param numKmers      the number of k-mers to return (if 0 or less, all will be returned).
 * @param input         the input path containing the kmers.
 * @param output        the output file path to write the json file to.
 * @return the actual number of kmers written out
 * @throws              java.io.IOException
 */
public static int kmerSequenceFile2Json(JobConf conf, int start, int end, int numKmers, String input,
        String output) throws IOException {
    Path outPath = new Path(output);
    FileSystem fs = outPath.getFileSystem(conf);

    FSDataOutputStream fos = fs.create(outPath, true); // throws nothing!
    Path inputPath = new Path(input);
    Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*");
    if (numKmers <= 0) {
        numKmers = Integer.MAX_VALUE;
    }
    int cnt = 0;
    Writer writer = new OutputStreamWriter(fos);
    JsonFactory jf = new JsonFactory();
    JsonGenerator jg = jf.createJsonGenerator(writer);
    CompleteCompositionVectorUtils util = new CompleteCompositionVectorUtils();
    try {
        jg.writeStartObject();
        util.writeJsonCcvProperties(jg, start, end);
        cnt = util.writeJsonKmers(conf, fs, paths, jg, numKmers);
        jg.writeEndObject();
        jg.close();
        writer.close();
    } catch (JsonGenerationException ex) {
        LOG.error("Unable to write the nmers to a json object", ex);
    }
    return cnt;
}

From source file:org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java

License:Open Source License

/**
 * Write out feature vectors, features (k-mers), and properties (start, end) to a JSON file.
 * <P>JSON format
 * <blockquote>
 * {
 *      "properties" :
 *      {
 *          "begin" : 3,
 *          "end"   : 9
 *      },
 *      "features" : [..],
 *      "samples" :
 *      [
 *          {
 *              "name" : "sample name",
 *              "data" : { nmer_index: non-zero pi-values }
 *          }, ....
 *      ]
 * }
 * </blockquote>
 *
 * The data will be the same as {@link org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix}, but the features
 * will be in a different order. The mapred version, by default, sorts only by entropy value, whereas the
 * ccv in-memory version sorts by the k-mer natural order (lexicographic).
 *
 * @see {@link org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix}
 *
 * @param conf          the job configuration
 * @param start         beginning window size
 * @param end           ending window size
 * @param numKmers      the number of k-mers to return (if 0 or less, all will be returned).
 * @param listInput     {@link SequenceFile} path containing k-mers used to generate the feature vectors.
 * @param featureInput  {@link SequenceFile} path containing feature vectors ({@link SparseVectorWritable}).
 * @param output        the output file path to write the json file to.
 * @return the actual number of kmers written out (not samples/feature vectors)
 * @throws java.io.IOException
 */
public static int featureVectors2Json(JobConf conf, int start, int end, int numKmers, String listInput,
        String featureInput, String output) throws IOException {
    Path outPath = new Path(output);
    FileSystem fs = outPath.getFileSystem(conf);

    FSDataOutputStream fos = fs.create(outPath, true); // throws nothing!
    if (numKmers <= 0) {
        numKmers = Integer.MAX_VALUE;
    }
    Writer writer = new OutputStreamWriter(fos);

    JsonFactory jf = new JsonFactory();
    JsonGenerator jg = jf.createJsonGenerator(writer);
    CompleteCompositionVectorUtils util = new CompleteCompositionVectorUtils();
    int cnt = 0;
    try {
        jg.writeStartObject();
        util.writeJsonCcvProperties(jg, start, end);

        /** Get k-mers (features) */
        Path inputPath = new Path(listInput);
        Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*");
        cnt = util.writeJsonKmers(conf, fs, paths, jg, numKmers);

        /** Get samples */
        inputPath = new Path(featureInput);
        paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*");
        util.jsonCcvVectors(conf, fs, paths, jg);
        jg.writeEndObject();
        jg.close();
        writer.close();
    } catch (JsonGenerationException ex) {
        LOG.error("Unable to write the nmers to a json object", ex);
    }
    return cnt;
}

From source file:org.mitre.ccv.mapred.SortKmerRevisedRelativeEntropies.java

License:Open Source License

/**
 * Returns the given number of k-mers from {@link SequenceFile}s generated by this class.
 *
 * @param conf
 * @param input the path containing the <code>SequenceFile</code> parts.
 * @param m     the number of k-mers to return. If <= 0, then all k-mers are returned.
 * @return
 * @throws java.io.IOException
 */
static TreeSet<String> getkmers(JobConf conf, String input, Integer m) throws IOException {
    TreeSet<String> nmers = new TreeSet<String>();
    Path inputPath = new Path(input);
    FileSystem fs = inputPath.getFileSystem(conf);
    //Path inputPath = fs.makeQualified(path);
    Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*");
    if (m <= 0) {
        m = Integer.MAX_VALUE;
    }
    int cnt = 0;
    for (int idx = 0; idx < paths.length; idx++) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf);
        KmerEntropyPairWritable key = new KmerEntropyPairWritable();
        boolean hasNext = true;
        while (hasNext && cnt < m) {
            hasNext = reader.next(key);
            if (hasNext) { // only add keys that were actually read
                nmers.add(key.getKey());
                cnt++;
            }
        }
        reader.close();
    }
    return nmers;
}