List of usage examples for org.apache.hadoop.fs FileSystem makeQualified
public Path makeQualified(Path path)
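makeQualified resolves a Path against the file system's scheme, authority, and working directory, returning an absolute, fully qualified path. Before the real-world examples below, here is a minimal sketch of a typical call; the relative path and the printed HDFS URI are illustrative, not taken from any of the sources:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MakeQualifiedExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // A relative path is resolved against the file system's working directory
        // and stamped with its scheme and authority, e.g. (hypothetically)
        // "output/part-00000" -> "hdfs://namenode:8020/user/alice/output/part-00000"
        Path qualified = fs.makeQualified(new Path("output/part-00000"));
        System.out.println(qualified);
    }
}

Qualifying paths this way is what lets the examples below compare paths obtained from different sources, or register them in the distributed cache, without ambiguity.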
From source file:org.apache.lens.driver.hive.TestHiveDriver.java
License:Apache License
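In this Lens test, makeQualified normalizes the expected output directory so it can be compared against the fully qualified path reported by the persistent result set.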
/**
 * Validate persistent result.
 *
 * @param resultSet   the result set
 * @param dataFile    the data file
 * @param outptuDir   the output dir
 * @param formatNulls the format nulls
 * @throws Exception the exception
 */
private void validatePersistentResult(LensResultSet resultSet, String dataFile, Path outptuDir,
    boolean formatNulls) throws Exception {
  assertTrue(resultSet instanceof HivePersistentResultSet,
      "resultset class: " + resultSet.getClass().getName());
  HivePersistentResultSet persistentResultSet = (HivePersistentResultSet) resultSet;
  String path = persistentResultSet.getOutputPath();
  Path actualPath = new Path(path);
  FileSystem fs = actualPath.getFileSystem(driverConf);
  assertEquals(actualPath, fs.makeQualified(outptuDir));
  List<String> actualRows = new ArrayList<String>();
  for (FileStatus stat : fs.listStatus(actualPath, new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return !new File(path.toUri()).isDirectory();
    }
  })) {
    FSDataInputStream in = fs.open(stat.getPath());
    BufferedReader br = null;
    try {
      br = new BufferedReader(new InputStreamReader(in));
      String line = "";
      while ((line = br.readLine()) != null) {
        System.out.println("Actual:" + line);
        actualRows.add(line.trim());
      }
    } finally {
      if (br != null) {
        br.close();
      }
    }
  }
  BufferedReader br = null;
  List<String> expectedRows = new ArrayList<String>();
  try {
    br = new BufferedReader(new FileReader(new File(dataFile)));
    String line = "";
    while ((line = br.readLine()) != null) {
      String row = line.trim();
      if (formatNulls) {
        row += ",-NA-,";
        row += line.trim();
      }
      expectedRows.add(row);
    }
  } finally {
    if (br != null) {
      br.close();
    }
  }
  assertEquals(actualRows, expectedRows);
}
From source file:org.apache.mahout.cf.taste.example.email.EmailUtility.java
License:Apache License
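Here the local file system qualifies each cached dictionary file before it is read back through a SequenceFileIterable.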
public static void loadDictionaries(Configuration conf, String fromPrefix,
    OpenObjectIntHashMap<String> fromDictionary, String msgIdPrefix,
    OpenObjectIntHashMap<String> msgIdDictionary) throws IOException {
  Path[] localFiles = HadoopUtil.getCachedFiles(conf);
  FileSystem fs = FileSystem.getLocal(conf);
  for (Path dictionaryFile : localFiles) {
    // key is word, value is id
    OpenObjectIntHashMap<String> dictionary = null;
    if (dictionaryFile.getName().startsWith(fromPrefix)) {
      dictionary = fromDictionary;
    } else if (dictionaryFile.getName().startsWith(msgIdPrefix)) {
      dictionary = msgIdDictionary;
    }
    if (dictionary != null) {
      dictionaryFile = fs.makeQualified(dictionaryFile);
      for (Pair<Writable, IntWritable> record
          : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
        dictionary.put(record.getFirst().toString(), record.getSecond().get());
      }
    }
  }
}
From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java
License:Apache License
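This utility qualifies every part file matched under the model directory before iterating over the sequence files to rebuild the HMM's distributions.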
public static HmmModel CreateHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
    Configuration conf) throws IOException {
  log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
  Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
  Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
  Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

  // Get the path location where the seq files encoding model are stored
  Path modelFilesPath = new Path(modelPath, "*");
  log.info("Create Hmm Model. ModelFiles Path = {}", modelFilesPath.toUri());
  Collection<Path> result = new ArrayList<Path>();

  // get all filtered file names in result list
  FileSystem fs = modelFilesPath.getFileSystem(conf);
  log.info("Create Hmm Model. File System = {}", fs);
  FileStatus[] matches = fs.listStatus(
      FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
      PathFilters.partFilter());
  for (FileStatus match : matches) {
    log.info("CreateHmmmModel Adding File Match {}", match.getPath().toString());
    result.add(fs.makeQualified(match.getPath()));
  }

  // iterate through the result path list
  for (Path path : result) {
    for (Pair<Writable, MapWritable> pair
        : new SequenceFileIterable<Writable, MapWritable>(path, true, conf)) {
      Text key = (Text) pair.getFirst();
      log.info("CreateHmmModel Matching Seq File Key = {}", key);
      MapWritable valueMap = pair.getSecond();
      if (key.charAt(0) == 'I') {
        // initial distribution stripe
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Initial Prob Adding Key, Value = ({} {})",
              ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get());
          initialProbabilities.set(((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == 'T') {
        // transition distribution stripe
        // key is of the form TRANSIT_0, TRANSIT_1 etc;
        // the digit after _ is the state ID at char index 8
        int stateID = Character.getNumericValue(key.charAt(8));
        log.info("CreateHmmModel stateID = key.charAt(8) = {}", stateID);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Transition Matrix ({}, {}) = {}",
              new Object[] { stateID, ((IntWritable) entry.getKey()).get(),
                  ((DoubleWritable) entry.getValue()).get() });
          transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == 'E') {
        // emission distribution stripe
        // key is of the form EMIT_0, EMIT_1 etc;
        // the digit after _ is the state ID at char index 5
        int stateID = Character.getNumericValue(key.charAt(5));
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Emission Matrix ({}, {}) = {}",
              new Object[] { stateID, ((IntWritable) entry.getKey()).get(),
                  ((DoubleWritable) entry.getValue()).get() });
          emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else {
        throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
      }
    }
  }
  HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
  HmmUtils.validate(model);
  return model;
}
From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.MapWritableCache.java
License:Apache License
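Before a MapWritable is written and registered in the distributed cache, the output path is qualified so the cached URI is unambiguous.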
/**
 * @param key SequenceFile key
 * @param map Map to save
 */
public static void save(Writable key, MapWritable map, Path output, Configuration conf,
    boolean overwritePath, boolean deleteOnExit) throws IOException {
  FileSystem fs = FileSystem.get(conf);
  output = fs.makeQualified(output);
  if (overwritePath) {
    HadoopUtil.delete(conf, output);
  }

  // set the cache
  DistributedCache.setCacheFiles(new URI[] { output.toUri() }, conf);

  // set up the writer
  SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output,
      IntWritable.class, MapWritable.class);
  try {
    writer.append(key, new MapWritable(map));
  } finally {
    Closeables.closeQuietly(writer);
  }

  if (deleteOnExit) {
    fs.deleteOnExit(output);
  }
}
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java
License:Apache License
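Another version of the Baum-Welch model builder above; the qualification of each matched part file is the same, but here the state ID is parsed from the key with String.split rather than charAt.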
/**
 * Converts the sequence files present in a directory to a {@link HmmModel} model.
 *
 * @param nrOfHiddenStates Number of hidden states
 * @param nrOfOutputStates Number of output states
 * @param modelPath        Location of the sequence files containing the model's distributions
 * @param conf             Configuration object
 * @return HmmModel the encoded model
 * @throws IOException
 */
public static HmmModel createHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
    Configuration conf) throws IOException {
  log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
  Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
  Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
  Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

  // Get the path location where the seq files encoding model are stored
  Path modelFilesPath = new Path(modelPath, "*");
  Collection<Path> result = new ArrayList<Path>();

  // get all filtered file names in result list
  FileSystem fs = modelFilesPath.getFileSystem(conf);
  FileStatus[] matches = fs.listStatus(
      FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
      PathFilters.partFilter());
  for (FileStatus match : matches) {
    result.add(fs.makeQualified(match.getPath()));
  }

  // iterate through the result path list
  for (Path path : result) {
    for (Pair<Writable, MapWritable> pair
        : new SequenceFileIterable<Writable, MapWritable>(path, true, conf)) {
      Text key = (Text) pair.getFirst();
      MapWritable valueMap = pair.getSecond();
      if (key.charAt(0) == (int) 'I') {
        // initial distribution stripe
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          initialProbabilities.set(((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == (int) 'T') {
        // transition distribution stripe
        // key is of the form TRANSIT_0, TRANSIT_1 etc
        int stateID = Integer.parseInt(key.toString().split("_")[1]);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == (int) 'E') {
        // emission distribution stripe
        // key is of the form EMIT_0, EMIT_1 etc
        int stateID = Integer.parseInt(key.toString().split("_")[1]);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else {
        throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
      }
    }
  }
  HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
  if (model != null) {
    return model;
  } else {
    throw new IOException("Error building model from output location");
  }
}
From source file:org.apache.mahout.clustering.spectral.common.TestVectorCache.java
License:Apache License
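This test qualifies a temp path before writing a vector to it, so that the URI placed in the distributed cache matches what VectorCache.load will resolve.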
@Test
public void testLoad() throws Exception {
  // save a vector manually
  Configuration conf = new Configuration();
  Writable key = new IntWritable(0);
  Vector value = new DenseVector(VECTOR);
  Path path = getTestTempDirPath("output");
  FileSystem fs = FileSystem.get(path.toUri(), conf);

  // write the vector
  path = fs.makeQualified(path);
  fs.deleteOnExit(path);
  HadoopUtil.delete(conf, path);
  DistributedCache.setCacheFiles(new URI[] { path.toUri() }, conf);
  SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
      IntWritable.class, VectorWritable.class);
  try {
    writer.append(key, new VectorWritable(value));
  } finally {
    Closeables.closeQuietly(writer);
  }

  // load it
  Vector result = VectorCache.load(conf);

  // are they the same?
  assertNotNull("Vector is not null", result);
  assertEquals("Loaded vector is identical to original", result, value);
}
From source file:org.apache.mahout.clustering.spectral.common.VectorCache.java
License:Apache License
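As in MapWritableCache above, the output path is qualified up front so the distributed-cache entry and the SequenceFile writer refer to the same absolute location.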
/**
 * @param key    SequenceFile key
 * @param vector Vector to save, to be wrapped as VectorWritable
 */
public static void save(Writable key, Vector vector, Path output, Configuration conf,
    boolean overwritePath, boolean deleteOnExit) throws IOException {
  FileSystem fs = FileSystem.get(output.toUri(), conf);
  output = fs.makeQualified(output);
  if (overwritePath) {
    HadoopUtil.delete(conf, output);
  }

  // set the cache
  DistributedCache.setCacheFiles(new URI[] { output.toUri() }, conf);

  // set up the writer
  SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output,
      IntWritable.class, VectorWritable.class);
  try {
    writer.append(key, new VectorWritable(vector));
  } finally {
    Closeables.closeQuietly(writer);
  }

  if (deleteOnExit) {
    fs.deleteOnExit(output);
  }
}
From source file:org.apache.mahout.clustering.spectral.common.VectorMatrixMultiplicationJob.java
License:Apache License
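Both the input matrix path and the job output path are qualified against the same file system before the diagonal vector is serialized and the MapReduce job is configured.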
public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath, Path tmpPath)
    throws IOException, ClassNotFoundException, InterruptedException {

  // set up the serialization of the diagonal vector
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(markovPath.toUri(), conf);
  markovPath = fs.makeQualified(markovPath);
  outputPath = fs.makeQualified(outputPath);
  Path vectorOutputPath = new Path(outputPath.getParent(), "vector");
  VectorCache.save(new IntWritable(EigencutsKeys.DIAGONAL_CACHE_INDEX), diag, vectorOutputPath, conf);

  // set up the job itself
  Job job = new Job(conf, "VectorMatrixMultiplication");
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapperClass(VectorMatrixMultiplicationMapper.class);
  job.setNumReduceTasks(0);

  FileInputFormat.addInputPath(job, markovPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.setJarByClass(VectorMatrixMultiplicationJob.class);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }

  // build the resulting DRM from the results
  return new DistributedRowMatrix(outputPath, tmpPath, diag.size(), diag.size());
}
From source file:org.apache.mahout.clustering.spectral.TestVectorCache.java
License:Apache License
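Another version of the TestVectorCache test above; here the cache files are registered after the writer is closed, and Closeables.close(writer, false) replaces closeQuietly.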
@Test
public void testLoad() throws Exception {
  // save a vector manually
  Configuration conf = getConfiguration();
  Writable key = new IntWritable(0);
  Vector value = new DenseVector(VECTOR);
  Path path = getTestTempDirPath("output");
  FileSystem fs = FileSystem.get(path.toUri(), conf);

  // write the vector
  path = fs.makeQualified(path);
  fs.deleteOnExit(path);
  HadoopUtil.delete(conf, path);
  SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
      IntWritable.class, VectorWritable.class);
  try {
    writer.append(key, new VectorWritable(value));
  } finally {
    Closeables.close(writer, false);
  }
  DistributedCache.setCacheFiles(new URI[] { path.toUri() }, conf);

  // load it
  Vector result = VectorCache.load(conf);

  // are they the same?
  assertNotNull("Vector is null", result);
  assertEquals("Loaded vector is not identical to original", result, value);
}
From source file:org.apache.mahout.clustering.spectral.VectorCache.java
License:Apache License
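A relocated VectorCache.save, identical in structure to the earlier version except that the writer is closed with Closeables.close(writer, false).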
/**
 * @param key    SequenceFile key
 * @param vector Vector to save, to be wrapped as VectorWritable
 */
public static void save(Writable key, Vector vector, Path output, Configuration conf,
    boolean overwritePath, boolean deleteOnExit) throws IOException {
  FileSystem fs = FileSystem.get(output.toUri(), conf);
  output = fs.makeQualified(output);
  if (overwritePath) {
    HadoopUtil.delete(conf, output);
  }

  // set the cache
  DistributedCache.setCacheFiles(new URI[] { output.toUri() }, conf);

  // set up the writer
  SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output,
      IntWritable.class, VectorWritable.class);
  try {
    writer.append(key, new VectorWritable(vector));
  } finally {
    Closeables.close(writer, false);
  }

  if (deleteOnExit) {
    fs.deleteOnExit(output);
  }
}