Usage examples for org.apache.hadoop.fs.FileSystem.makeQualified
public Path makeQualified(Path path)
From source file:org.mitre.ccv.mapred.CalculateCosineDistanceMatrix.java
License:Open Source License
/** * Writes out the matrix in row major (packed) order. No labels are outputed. * * @param jobConf/* w w w . j a v a 2 s. c o m*/ * @param input * @param output * @param digits * @throws IOException */ public static void printRowMajorMatrix(JobConf jobConf, String input, String output, int digits) throws IOException { JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class); DecimalFormat format = new DecimalFormat(); format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US)); format.setMinimumIntegerDigits(1); format.setMaximumFractionDigits(digits); //format.setMinimumFractionDigits(fractionDigits); format.setGroupingUsed(false); final Path inputPath = new Path(input); final FileSystem fs = inputPath.getFileSystem(conf); final Path qInputPath = fs.makeQualified(inputPath); final Path outputPath = new Path(output); Path[] paths = FileUtils.ls(conf, qInputPath.toString() + Path.SEPARATOR + "part-*"); FSDataOutputStream fos = fs.create(outputPath, true); // throws nothing! final Writer writer = new OutputStreamWriter(fos); final Text key = new Text(); final DenseVectorWritable value = new DenseVectorWritable(); for (int idx = 0; idx < paths.length; idx++) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf); boolean hasNext = reader.next(key, value); while (hasNext) { final DenseVector vector = value.get(); final StringBuilder sb = new StringBuilder(); for (int i = 0; i < vector.getCardinality(); i++) { final String s = format.format(vector.get(i)); // format the number sb.append(s); sb.append(' '); } writer.write(sb.toString()); hasNext = reader.next(key, value); } try { writer.flush(); reader.close(); } catch (IOException ioe) { // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size LOG.debug("Probably caused by closing the SequenceFile.Reader. 
All is well", ioe); } } try { writer.close(); fos.flush(); fos.close(); } catch (IOException ioe) { LOG.debug("Caused by distributed cache output stream.", ioe); } }
From source file:org.mitre.ccv.mapred.CalculateCosineDistanceMatrix.java
License:Open Source License
/** * Outputs the distance matrix (DenseVectors) in Phylip Square format. Names/labels are limited to 10-characters! * * @param jobConf// ww w .j av a2s. c o m * @param input input directory name containing DenseVectors (as generated by this class). * @param output output file name * @param fractionDigits number of digits after decimal point * @throws IOException */ public static void printPhylipSquare(JobConf jobConf, String input, String output, int fractionDigits) throws IOException { JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class); DecimalFormat format = new DecimalFormat(); format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US)); format.setMinimumIntegerDigits(1); format.setMaximumFractionDigits(fractionDigits); //format.setMinimumFractionDigits(fractionDigits); format.setGroupingUsed(false); final Path inputPath = new Path(input); final FileSystem fs = inputPath.getFileSystem(conf); final Path qInputPath = fs.makeQualified(inputPath); final Path outputPath = new Path(output); Path[] paths = FileUtils.ls(conf, qInputPath.toString() + Path.SEPARATOR + "part-*"); FSDataOutputStream fos = fs.create(outputPath, true); // throws nothing! Writer writer = new OutputStreamWriter(fos); Text key = new Text(); DenseVectorWritable value = new DenseVectorWritable(); Boolean wroteHeader = false; for (int idx = 0; idx < paths.length; idx++) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf); boolean hasNext = reader.next(key, value); while (hasNext) { final DenseVector vector = value.get(); if (!wroteHeader) { writer.write(String.format("\t%d\n", vector.getCardinality())); wroteHeader = true; } final StringBuilder sb = new StringBuilder(); final String name = key.toString(); sb.append(name.substring(0, (name.length() > 10 ? 
10 : name.length()))); final int padding = Math.max(1, 10 - name.length()); for (int k = 0; k < padding; k++) { sb.append(' '); } sb.append(' '); for (int i = 0; i < vector.getCardinality(); i++) { final String s = format.format(vector.get(i)); // format the number sb.append(s); sb.append(' '); } sb.append("\n"); writer.write(sb.toString()); hasNext = reader.next(key, value); } try { writer.flush(); reader.close(); } catch (IOException ioe) { // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size LOG.debug("Probably caused by closing the SequenceFile.Reader. All is well", ioe); } } try { writer.close(); fos.flush(); fos.close(); } catch (IOException ioe) { LOG.debug("Caused by distributed cache output stream.", ioe); } }
From source file:org.mitre.ccv.mapred.CalculateCosineDistanceMatrix.java
License:Open Source License
public int initJob(JobConf jobConf, String input, String output) throws Exception { JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class); final Path inputPath = new Path(input); final FileSystem fs = inputPath.getFileSystem(conf); final Path qInputPath = fs.makeQualified(inputPath); /**//from w w w .j av a2 s . c om * Need to get all of the sample names/labels */ JobConf cacheConf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class); cacheConf.setJobName("CacheNorm2MapReduce"); cacheConf.setNumReduceTasks(1); // Want ONE part file // Set up IdentityMapper SequenceFileInputFormat.setInputPaths(cacheConf, new Path(input)); cacheConf.setInputFormat(SequenceFileInputFormat.class); cacheConf.setMapperClass(Norm2Mapper.class); cacheConf.setOutputKeyClass(StringDoublePairWritable.class); cacheConf.setOutputValueClass(SparseVectorWritable.class); // Set up IdentityReducer cacheConf.setReducerClass(IdentityReducer.class); cacheConf.setOutputFormat(SequenceFileOutputFormat.class); cacheConf.setNumReduceTasks(1); Path sfPath = FileUtils.createRemoteTempPath(fs, qInputPath.getParent()); LOG.info(String.format("Generating feature vector SequenceFile path %s", sfPath.toString())); SequenceFileOutputFormat.setOutputPath(cacheConf, sfPath); JobClient.runJob(cacheConf); Path cachePath = new Path(sfPath.toString() + Path.SEPARATOR + "part-00000"); // need to know the size (the reducer might be able to send this back via the Reporter, but how do we grab that info? StringDoublePairWritable key = new StringDoublePairWritable(); int size = 0; SequenceFile.Reader reader = new SequenceFile.Reader(fs, cachePath, conf); boolean hasNext = reader.next(key); while (hasNext) { size += 1; hasNext = reader.next(key); } try { reader.close(); } catch (IOException ioe) { // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size LOG.debug("Probably caused by closing the SequenceFile.Reader. 
All is well", ioe); } //LOG.info(String.format("Caching model file %s", qInputPath.toString())); URI listURI = new URI(fs.makeQualified(cachePath).toString()); DistributedCache.addCacheFile(listURI, conf); LOG.info(String.format("SequenceFile cache path %s (%s) with %d labels", listURI.toString(), cachePath.getName(), size)); conf.set(CACHE_PATH, cachePath.getName()); conf.setInt(DISTANCE_MATRIX_SIZE, size); /** * Main MapReduce Task of generating dot products */ LOG.info("Generating distances"); JobConf distanceConf = new JobConf(conf, CalculateCosineDistanceMatrix.class); distanceConf.setJobName("DistanceMapReduce"); // Set up distance mapper SequenceFileInputFormat.setInputPaths(distanceConf, new Path(input)); distanceConf.setInputFormat(SequenceFileInputFormat.class); distanceConf.setMapperClass(DistanceMap.class); distanceConf.setMapOutputKeyClass(Text.class); distanceConf.setMapOutputValueClass(SparseVectorWritable.class); // Set up reducer to merge lower-triangle results into a single dense distance vector distanceConf.setReducerClass(DistanceReducer.class); distanceConf.setOutputKeyClass(Text.class); distanceConf.setOutputValueClass(DenseVectorWritable.class); distanceConf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(distanceConf, new Path(output)); JobClient.runJob(distanceConf); return 0; }
From source file:org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java
License:Open Source License
/** * Writes out the {@link SequenceFile} feature vectors in row major (packed) order. No labels are outputed. * * @param jobConf//from w w w. j av a 2 s. c om * @param input top level SequenceFile directory path * @param output path to output the matrix * @param digits the maximum number of fraction digits * @throws IOException */ public static void featureVectors2RowMajorMatrix(JobConf jobConf, String input, String output, int digits) throws IOException { JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class); DecimalFormat format = new DecimalFormat(); format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US)); format.setMinimumIntegerDigits(1); format.setMaximumFractionDigits(digits); //format.setMinimumFractionDigits(fractionDigits); format.setGroupingUsed(false); final Path inputPath = new Path(input); final FileSystem fs = inputPath.getFileSystem(conf); final Path qInputPath = fs.makeQualified(inputPath); final Path outputPath = new Path(output); Path[] paths = FileUtils.ls(conf, qInputPath.toString() + Path.SEPARATOR + "part-*"); FSDataOutputStream fos = fs.create(outputPath, true); // throws nothing! 
final Writer writer = new OutputStreamWriter(fos); final Text key = new Text(); final SparseVectorWritable value = new SparseVectorWritable(); for (int idx = 0; idx < paths.length; idx++) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf); boolean hasNext = reader.next(key, value); while (hasNext) { final SparseVector vector = value.get(); final StringBuilder sb = new StringBuilder(); for (int i = 0; i < vector.getCardinality(); i++) { final String s = format.format(vector.get(i)); // format the number sb.append(s); sb.append(' '); } writer.write(sb.toString()); hasNext = reader.next(key, value); } try { writer.flush(); reader.close(); } catch (IOException ioe) { // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size LOG.debug("Probably caused by closing the SequenceFile.Reader. All is well", ioe); } } try { writer.close(); fos.flush(); fos.close(); } catch (IOException ioe) { LOG.debug("Caused by distributed cache output stream.", ioe); } }
From source file:org.mitre.ccv.mapred.GenerateFeatureVectors.java
License:Open Source License
/** * Start a new job with the given configuration and parameters. * * @param jobConf//from w w w . j a v a 2s. co m * @param listInput file path containing list of k-mers to use * @param cardinality number of k-mers to use (if list contains less,then that will be used instead). * @param input composition vector {@link SequenceFile} such as generated by {@link CalculateCompositionVectors} * @param output * @param cleanLogs * @return zero if no errors * @throws java.lang.Exception */ public int initJob(JobConf jobConf, String listInput, Integer cardinality, String input, String output, boolean cleanLogs) throws Exception { JobConf conf = new JobConf(jobConf, GenerateFeatureVectors.class); conf.setJobName("GenerateFeatureVectors"); Path listPath = new Path(listInput); // i.e, listInput = win32_200902260829/kmer_120811a7fa1_tmp FileSystem fs = listPath.getFileSystem(conf); if (listInput != null) { // @todo: should check to see if it is there! // It doesn't say it, but we need the quailifed path with the host name // otherwise URI sticks the host on to it not so nicely Path qPath = fs.makeQualified(listPath); // listPath = hdfs://rocks5.local:54310/user/mcolosimo/win32_200902260829/kmer_120811a7fa1_tmp LOG.info(String.format("Caching k-mer file %s", qPath.toString())); // URI:hdfs://rocks5.local:54310/user/mcolosimo/win32_200902260829/kmer_120811a7fa1_tmp URI listURI = new URI(qPath.toString()); DistributedCache.addCacheFile(listURI, conf); conf.set(KMER_LIST, listPath.getName()); //LOG.info("k-mer URI:" + listURI.toString()); } else { throw new Exception("GenerateFeatureVectors requires a list of k-mers!"); } /** We need this. It is okay if the cardinality is larger than the number of k-mers. 
*/ if (cardinality == null) { LOG.info("Scanning k-mer file to determine cardinality"); FSDataInputStream ins = fs.open(listPath); KmerEntropyPairWritable w = new KmerEntropyPairWritable(); int c = 0; while (ins.available() > 0) { w.readFields(ins); c++; } ins.close(); fs.close(); LOG.info(String.format("Found %d k-mers in the file", c)); cardinality = c; } conf.setInt(VECTOR_CARDINALITY, cardinality); // Set up mapper SequenceFileInputFormat.setInputPaths(conf, new Path(input)); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapperClass(CompositionVectorMap.class); conf.setOutputKeyClass(Text.class); // final output key class - sample name conf.setOutputValueClass(SparseVectorWritable.class); // final output value class // Set up combiner/reducer conf.setReducerClass(Features2VectorReducer.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(conf, new Path(output)); JobClient.runJob(conf); return 0; }
From source file:org.mitre.ccv.weka.mapred.ClassifyInstances.java
License:Open Source License
public int initJob(JobConf jobConf, String modelInput, String input, String output) throws Exception { JobConf conf = new JobConf(jobConf, ClassifyInstances.class); conf.setJobName("ClassifyInstances"); Path listPath = new Path(modelInput); FileSystem fs = listPath.getFileSystem(conf); if (modelInput != null) { Path qPath = fs.makeQualified(listPath); LOG.info(String.format("Caching model file %s", qPath.toString())); URI listURI = new URI(qPath.toString()); DistributedCache.addCacheFile(listURI, conf); conf.set(MODEL_PATH, listPath.getName()); } else {// w ww . j a va2 s. co m throw new Exception("ClassifyInstances requires a model!"); } // Set up mapper SequenceFileInputFormat.setInputPaths(conf, new Path(input)); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapperClass(CompositionVectorJ48Map.class); // Painful way to set job output key class because we can't use WritableComparable String sortBy = conf.get(SORT_OUTPUT_BY, null); if (sortBy != null && !sortBy.equals(SORTBY_SAMPLE)) { LOG.info("Sorting output by class name and/or confidence."); conf.setOutputKeyClass(StringDoublePairWritable.class); } else { LOG.info("Sorting output by sample name."); conf.setOutputKeyClass(Text.class); } conf.setOutputValueClass(Text.class); // job output value class // Uses default reducer (IdentityReducer) and save it to a plain text file conf.setOutputFormat(TextOutputFormat.class); TextOutputFormat.setOutputPath(conf, new Path(output)); JobClient.runJob(conf); return 0; }
From source file:org.mitre.mapred.fs.FileUtils.java
License:Open Source License
/** * Returns a tmp path on the remote FileSystem. * * @param fs//from w w w .j a v a2s . c o m * @param basePath * @return The path * @throws java.io.IOException */ public static final Path createRemoteTempPath(FileSystem fs, Path basePath) throws IOException { long now = System.currentTimeMillis(); // @TODO: add constant and look up tmp dir name Path tmpDirPath = new Path(basePath.toString() + Path.SEPARATOR + "tmp_" + Long.toHexString(now)); // check to see if unqiue? return fs.makeQualified(tmpDirPath); }
From source file:org.mrgeo.format.CsvInputFormatTest.java
License:Apache License
@Test @Category(UnitTest.class) public void testBasics() throws Exception { // this class and its unit tests are a work in progress. FileSystem fs = new RawLocalFileSystem(); try {//from w w w . j a v a 2 s. co m Job j = new Job(new Configuration()); Configuration c = j.getConfiguration(); fs.setConf(c); Path testFile = new Path(input, "testBasics.csv"); testFile = fs.makeQualified(testFile); FileInputFormat.addInputPath(j, testFile); FileSplit split = new FileSplit(testFile, 0, 500, null); CsvInputFormat.CsvRecordReader reader = new CsvInputFormat.CsvRecordReader(); reader.initialize(split, HadoopUtils.createTaskAttemptContext(c, new TaskAttemptID())); @SuppressWarnings("unused") int l = 0; StringBuffer buf = new StringBuffer(); String[] base = { "word1:Hello word2:world number:1 ", "word1:foo word2:bar number:2 ", "word1:cat word2:dog number:3 ", "word1:rock word2:paper number:4 ", "word1:red word2:blue, number:5 ", "word1:,green, word2:,, number:6 ", }; int index = 0; while (reader.nextKeyValue()) { Geometry f = reader.getCurrentValue(); String row = ""; for (Map.Entry attr : f.getAllAttributes().entrySet()) { row += attr.getKey() + ":" + attr.getValue() + " "; } Assert.assertEquals("Error in row " + index, base[index++], row); } // This hash code will tell us if anything changes then it can be manually verified. } catch (Exception e) { e.printStackTrace(); throw e; } finally { fs.close(); } }
From source file:org.mrgeo.format.CsvInputFormatTest.java
License:Apache License
@Test @Category(UnitTest.class) public void testNullProcessing() throws Exception { // this class and its unit tests are a work in progress. FileSystem fs = new RawLocalFileSystem(); try {//w w w . ja v a2s .c o m Job j = new Job(new Configuration()); Configuration c = j.getConfiguration(); fs.setConf(c); Path testFile = new Path(input, "testNullValues.csv"); testFile = fs.makeQualified(testFile); FileInputFormat.addInputPath(j, testFile); FileSplit split = new FileSplit(testFile, 0, 500, null); CsvInputFormat.CsvRecordReader reader = new CsvInputFormat.CsvRecordReader(); reader.initialize(split, HadoopUtils.createTaskAttemptContext(c, new TaskAttemptID())); @SuppressWarnings("unused") int l = 0; //StringBuffer buf = new StringBuffer(); // Test specific rows returned to make sure the values are as expected. Assert.assertTrue(reader.nextKeyValue()); Geometry f = reader.getCurrentValue(); Assert.assertNotNull(f); Assert.assertEquals("test1", f.getAttribute("string1")); Assert.assertEquals(1.0, Double.parseDouble(f.getAttribute("int1")), EPSILON); Assert.assertEquals(1.5, Double.parseDouble(f.getAttribute("double1")), EPSILON); // Row 2 check Assert.assertTrue(reader.nextKeyValue()); f = reader.getCurrentValue(); Assert.assertNotNull(f); Assert.assertEquals("test2", f.getAttribute("string1")); Assert.assertEquals(2.0, Double.parseDouble(f.getAttribute("int1")), EPSILON); Assert.assertNull("Expected null value instead of: " + f.getAttribute("double1"), f.getAttribute("2")); // Row 3 check Assert.assertTrue(reader.nextKeyValue()); f = reader.getCurrentValue(); Assert.assertNotNull(f); Assert.assertEquals("test3", f.getAttribute("string1")); Assert.assertEquals(3.0, Double.parseDouble(f.getAttribute("int1")), EPSILON); Assert.assertEquals(3.5, Double.parseDouble(f.getAttribute("double1")), EPSILON); // Row 4 check Assert.assertTrue(reader.nextKeyValue()); f = reader.getCurrentValue(); Assert.assertNotNull(f); Assert.assertEquals("test4", f.getAttribute("string1")); 
Assert.assertNull("Expected null value instead of: " + f.getAttribute("int1"), f.getAttribute("1")); Assert.assertEquals(4.5, Double.parseDouble(f.getAttribute("double1")), EPSILON); // Row 5 check Assert.assertTrue(reader.nextKeyValue()); f = reader.getCurrentValue(); Assert.assertNotNull(f); Assert.assertEquals("test5", f.getAttribute("string1")); Assert.assertEquals(5.0, Double.parseDouble(f.getAttribute("int1")), EPSILON); Assert.assertEquals(5.5, Double.parseDouble(f.getAttribute("double1")), EPSILON); // Row 6 check Assert.assertTrue(reader.nextKeyValue()); f = reader.getCurrentValue(); Assert.assertNotNull(f); Assert.assertEquals("test6", f.getAttribute("string1")); Assert.assertEquals("", f.getAttribute("int1")); Assert.assertEquals("", f.getAttribute("double1")); // Row 7 check Assert.assertTrue(reader.nextKeyValue()); f = reader.getCurrentValue(); Assert.assertNotNull(f); Assert.assertEquals("test7", f.getAttribute("string1")); Assert.assertNull("Expected null value instead of: " + f.getAttribute("int1"), f.getAttribute("int1")); Assert.assertNull("Expected null value instead of: " + f.getAttribute("double1"), f.getAttribute("double1")); Assert.assertFalse(reader.nextKeyValue()); } catch (Exception e) { e.printStackTrace(); throw e; } finally { fs.close(); } }
From source file:org.mrgeo.format.CsvInputFormatTest.java
License:Apache License
@Test @Category(UnitTest.class) public void testNullIgnore() throws Exception { FileSystem fs = new RawLocalFileSystem(); try {//from ww w. j a v a 2s .c o m int lineCount = 0; // Write columns file which defines the columns title and type String cstr = "<?xml version='1.0' encoding='UTF-8'?>\n<AllColumns firstLineHeader='false'>\n"; cstr += " <Column name='name' type='Nominal'/>\n"; cstr += " <Column name='x' type='Numeric'/>\n"; cstr += " <Column name='y' type='Numeric'/>\n"; cstr += "</AllColumns>\n"; FileOutputStream fos = new FileOutputStream(output + "/nulXY.csv.columns"); PrintStream ps = new PrintStream(fos); ps.print(cstr); ps.close(); // Write csv test data fos = new FileOutputStream(output + "/nullXY.csv"); ps = new PrintStream(fos); // populated rows for (int ii = 0; ii < 10; ii++) { ps.print("ASDF,1.0,1.0\n"); lineCount++; } // empty rows ps.print("ASDF,,1.0\n"); ps.print("ASDF,1.0,\n"); ps.print("ASDF,,\n"); lineCount += 3; // populated rows for (int ii = 0; ii < 5; ii++) { ps.print("ASDF,1.0,1.0\n"); lineCount++; } ps.close(); System.out.println(output + "nulXY.csv"); Job j = new Job(new Configuration()); Configuration c = j.getConfiguration(); fs.setConf(c); Path testFile = new Path(output, "nullXY.csv"); testFile = fs.makeQualified(testFile); InputSplit split; long l; long start; TextInputFormat format = new TextInputFormat(); split = new FileSplit(testFile, 0, lineCount * 1000, null); RecordReader<LongWritable, Text> reader2 = format.createRecordReader(split, HadoopUtils.createTaskAttemptContext(c, new TaskAttemptID())); reader2.initialize(split, HadoopUtils.createTaskAttemptContext(c, new TaskAttemptID())); l = 0; start = System.currentTimeMillis(); while (reader2.nextKeyValue()) { reader2.getCurrentValue().toString(); l++; } Assert.assertEquals(lineCount, l); System.out.printf("text line reader with null x,y ignore: %d\n", System.currentTimeMillis() - start); } catch (Exception e) { e.printStackTrace(); throw e; } finally { fs.close(); } }