Usage examples for org.apache.hadoop.fs.FileSystem.makeQualified
public Path makeQualified(Path path)
From source file:org.apache.mahout.clustering.spectral.VectorMatrixMultiplicationJob.java
License:Apache License
public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath, Path tmpPath) throws IOException, ClassNotFoundException, InterruptedException { // set up the serialization of the diagonal vector Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(markovPath.toUri(), conf); markovPath = fs.makeQualified(markovPath); outputPath = fs.makeQualified(outputPath); Path vectorOutputPath = new Path(outputPath.getParent(), "vector"); VectorCache.save(new IntWritable(Keys.DIAGONAL_CACHE_INDEX), diag, vectorOutputPath, conf); // set up the job itself Job job = new Job(conf, "VectorMatrixMultiplication"); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapperClass(VectorMatrixMultiplicationMapper.class); job.setNumReduceTasks(0);// w w w. j a v a 2s.co m FileInputFormat.addInputPath(job, markovPath); FileOutputFormat.setOutputPath(job, outputPath); job.setJarByClass(VectorMatrixMultiplicationJob.class); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { throw new IllegalStateException("Job failed!"); } // build the resulting DRM from the results return new DistributedRowMatrix(outputPath, tmpPath, diag.size(), diag.size()); }
From source file:org.apache.mahout.freqtermsets.PFPGrowth.java
License:Apache License
/** * Generates the fList from the serialized string representation * //from w w w .ja va2 s . co m * @return Deserialized Feature Frequency List */ public static OpenObjectLongHashMap<String> readOlderCachedFLists(Configuration conf, long currWindowStart, TimeWeightFunction weightFunction) throws IOException { OpenObjectLongHashMap<String> list = new OpenObjectLongHashMap<String>(); Path[] files = DistributedCache.getLocalCacheFiles(conf); if (files == null) { throw new IOException("Cannot read Frequency list from Distributed Cache"); } for (int i = 0; i < files.length; ++i) { FileSystem fs = FileSystem.getLocal(conf); Path fListLocalPath = fs.makeQualified(files[i]); // Fallback if we are running locally. if (!fs.exists(fListLocalPath)) { URI[] filesURIs = DistributedCache.getCacheFiles(conf); if (filesURIs == null) { throw new IOException("Cannot read Frequency list from Distributed Cache"); } fListLocalPath = new Path(filesURIs[i].getPath()); } long listWindowStart = Long.parseLong(fListLocalPath.getParent().getParent().getName()); for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(fListLocalPath, true, conf)) { String token = record.getFirst().toString(); list.put(token, Math.round(list.get(token) + weightFunction.apply(record.getSecond().get(), listWindowStart, currWindowStart))); } } return list; }
From source file:org.apache.mahout.freqtermsets.PFPGrowth.java
License:Apache License
/** * Generates the fList from the serialized string representation * /* www.java 2 s .c o m*/ * @return Deserialized Feature Frequency List */ public static List<Pair<String, Long>> readCachedFList(Configuration conf) throws IOException { List<Pair<String, Long>> list = new ArrayList<Pair<String, Long>>(); Path[] files = DistributedCache.getLocalCacheFiles(conf); if (files == null) { throw new IOException("Cannot read Frequency list from Distributed Cache"); } if (files.length != 1) { throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ")"); } FileSystem fs = FileSystem.getLocal(conf); Path fListLocalPath = fs.makeQualified(files[0]); // Fallback if we are running locally. if (!fs.exists(fListLocalPath)) { URI[] filesURIs = DistributedCache.getCacheFiles(conf); if (filesURIs == null) { throw new IOException("Cannot read Frequency list from Distributed Cache"); } if (filesURIs.length != 1) { throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ")"); } fListLocalPath = new Path(filesURIs[0].getPath()); } // Done below, while caching the list // // YA: Lang independent stop words removal // // FIXMENOT: as below // Parameters params = new Parameters(conf.get(PFP_PARAMETERS, "")); // int minFr = params.getInt(MIN_FREQ, MIN_FREQ_DEFAULT); // int prunePct = params.getInt(PRUNE_PCTILE, PRUNE_PCTILE_DEFAULT); // // // TODONOT: assert minFr >= minSupport; // // Iterator<Pair<Text, LongWritable>> tempIter = new SequenceFileIterable<Text, LongWritable>( // fListLocalPath, true, conf).iterator(); // long maxFr = Long.MAX_VALUE; // if (tempIter.hasNext()) { // maxFr = tempIter.next().getSecond().get() * prunePct / 100; // } // tempIter = null; // // for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>( // fListLocalPath, true, conf)) { // String token = record.getFirst().toString(); // char ch0 = token.charAt(0); // if ((ch0 != '#' && ch0 != '@') // && 
(record.getSecond().get() < minFr || record.getSecond().get() > maxFr)) { // continue; // } // list.add(new Pair<String, Long>(token, record.getSecond().get())); // } // // END YA for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(fListLocalPath, true, conf)) { list.add(new Pair<String, Long>(record.getFirst().toString(), record.getSecond().get())); } return list; }
From source file:org.apache.mahout.freqtermsets.PFPGrowth.java
License:Apache License
/** * Serializes the fList and returns the string representation of the List * /*from www.ja va2s . c om*/ * @param flistPath * * @return Serialized String representation of List */ public static void saveFList(Iterable<Pair<String, Long>> flist, // Parameters params, Configuration conf, Path flistPath) throws IOException { FileSystem fs = FileSystem.get(flistPath.toUri(), conf); flistPath = fs.makeQualified(flistPath); // HadoopUtil.delete(conf, flistPath); SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, flistPath, Text.class, LongWritable.class); try { for (Pair<String, Long> pair : flist) { writer.append(new Text(pair.getFirst()), new LongWritable(pair.getSecond())); } } finally { writer.close(); } DistributedCache.addCacheFile(flistPath.toUri(), conf); }
From source file:org.apache.mahout.ga.watchmaker.cd.DataLineTest.java
License:Apache License
public void testSet() throws Exception { FileSystem fs = FileSystem.get(new Configuration()); Path inpath = fs.makeQualified(new Path(Resources.getResource("wdbc").toString())); DataSet dataset = FileInfoParser.parseFile(fs, inpath); DataSet.initialize(dataset);//from w w w. j a v a 2 s . c o m DataLine dl = new DataLine(); int labelpos = dataset.getLabelIndex(); dl.set(datalines[0]); assertEquals(dataset.valueIndex(labelpos, "M"), dl.getLabel()); dl.set(datalines[1]); assertEquals(dataset.valueIndex(labelpos, "B"), dl.getLabel()); dl.set(datalines[2]); assertEquals(dataset.valueIndex(labelpos, "M"), dl.getLabel()); }
From source file:org.apache.mahout.ga.watchmaker.cd.FileInfosDatasetTest.java
License:Apache License
public void testRanges() throws IOException { FileSystem fs = FileSystem.get(new Configuration()); Path inpath = fs.makeQualified(new Path(Resources.getResource("wdbc").toString())); DataSet dataset = FileInfoParser.parseFile(fs, inpath); DataSet.initialize(dataset);// ww w . j a v a 2 s . c o m DataLine dl = new DataLine(); for (String line : new FileLineIterable(new File(Resources.getResource("wdbc/wdbc.data").getPath()))) { dl.set(line); for (int index = 0; index < dataset.getNbAttributes(); index++) { if (dataset.isNumerical(index)) { assertInRange(dl.getAttribute(index), dataset.getMin(index), dataset.getMax(index)); } else { assertInRange(dl.getAttribute(index), 0, dataset.getNbValues(index)); } } } }
From source file:org.apache.mahout.ga.watchmaker.cd.hadoop.CDMahoutEvaluatorTest.java
License:Apache License
public void testEvaluate() throws Exception { int nbrules = 100; Random rng = RandomUtils.getRandom(); int target = 1; // random rules List<Rule> rules = new ArrayList<Rule>(); for (int index = 0; index < nbrules; index++) { rules.add(new RandomRule(index, target, rng)); }// w w w . jav a 2 s . c om // dataset // This is sensitive to the working directory where the test is run: FileSystem fs = FileSystem.get(new Configuration()); Path input = fs.makeQualified(new Path(Resources.getResource("wdbc").toString())); CDMahoutEvaluator.initializeDataSet(input); // evaluate the rules List<CDFitness> results = new ArrayList<CDFitness>(); Path output = getTestTempDirPath("output"); fs = output.getFileSystem(new Configuration()); fs.delete(output, true); // It's unhappy if this directory exists CDMahoutEvaluator.evaluate(rules, target, input, output, results); // check the results for (int index = 0; index < nbrules; index++) { assertEquals("rule " + index, RandomRuleResults.getResult(index), results.get(index)); } }
From source file:org.apache.mahout.math.hadoop.TimesSquaredJob.java
License:Apache License
/**
 * Prepares (but does not submit) a job over the matrix at
 * {@code matrixInputPath}, with {@code v} made available through the
 * distributed cache.
 *
 * <p>The vector is written to a sequence file under the qualified output base,
 * in a sub-path made of {@code INPUT_VECTOR} and the current
 * {@link System#nanoTime()} value. That file's URI becomes the only
 * distributed-cache entry of {@code initialConf} and is also stored in the job
 * configuration under {@code INPUT_VECTOR}.
 *
 * @param initialConf          configuration the job is prepared from; its cache files are overwritten
 * @param v                    input vector to serialize
 * @param outputVectorDim      stored under {@code OUTPUT_VECTOR_DIMENSION}
 * @param matrixInputPath      matrix location; also determines the file system used
 * @param outputVectorPathBase base directory for the serialized vector and the job output
 * @param mapClass             mapper class for the job
 * @param redClass             reducer class, also installed as the combiner
 * @return the configured job
 * @throws IOException if the vector file cannot be written or the job cannot be prepared
 */
public static Job createTimesSquaredJob(Configuration initialConf, Vector v, int outputVectorDim,
    Path matrixInputPath, Path outputVectorPathBase, Class<? extends TimesSquaredMapper> mapClass,
    Class<? extends VectorSummingReducer> redClass) throws IOException {

  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), initialConf);
  Path qualifiedMatrix = fs.makeQualified(matrixInputPath);
  Path qualifiedOutputBase = fs.makeQualified(outputVectorPathBase);

  // A nanoTime-based path component keeps vector files from different calls apart.
  long stamp = System.nanoTime();
  Path inputVectorPath = new Path(qualifiedOutputBase, INPUT_VECTOR + '/' + stamp);

  SequenceFile.Writer vectorWriter = null;
  try {
    vectorWriter = new SequenceFile.Writer(fs, initialConf, inputVectorPath, NullWritable.class,
        VectorWritable.class);
    vectorWriter.append(NullWritable.get(), new VectorWritable(v));
  } finally {
    Closeables.close(vectorWriter, false);
  }

  // The cache entry must be set on initialConf before the job is prepared from it.
  URI inputVectorUri = inputVectorPath.toUri();
  DistributedCache.setCacheFiles(new URI[] { inputVectorUri }, initialConf);

  Path jobOutputPath = new Path(qualifiedOutputBase, OUTPUT_VECTOR_FILENAME);
  Job job = HadoopUtil.prepareJob(qualifiedMatrix, jobOutputPath, SequenceFileInputFormat.class, mapClass,
      NullWritable.class, VectorWritable.class, redClass, NullWritable.class, VectorWritable.class,
      SequenceFileOutputFormat.class, initialConf);
  job.setCombinerClass(redClass);
  job.setJobName("TimesSquaredJob: " + qualifiedMatrix);

  Configuration jobConf = job.getConfiguration();
  jobConf.set(INPUT_VECTOR, inputVectorUri.toString());
  jobConf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
  jobConf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);
  return job;
}
From source file:org.apache.nifi.processors.hadoop.KeyValueReader.java
License:Apache License
@Override public Set<FlowFile> readSequenceFile(Path file, Configuration configuration, FileSystem fileSystem) throws IOException { final SequenceFile.Reader reader; Set<FlowFile> flowFiles = new HashSet<>(); reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file))); final Text key = new Text(); final KeyValueWriterCallback callback = new KeyValueWriterCallback(reader); final String inputfileName = file.getName() + "." + System.nanoTime() + "."; int counter = 0; LOG.debug("Read from SequenceFile: {} ", new Object[] { file }); try {// ww w . j a va 2s.com while (reader.next(key)) { String fileName = key.toString(); // the key may be a file name, and may not if (LOOKS_LIKE_FILENAME.matcher(fileName).matches()) { if (fileName.contains(File.separator)) { fileName = StringUtils.substringAfterLast(fileName, File.separator); } fileName = fileName + "." + System.nanoTime(); } else { fileName = inputfileName + ++counter; } FlowFile flowFile = session.create(); flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), fileName); callback.key = key; try { flowFile = session.write(flowFile, callback); flowFiles.add(flowFile); } catch (ProcessException e) { LOG.error("Could not write to flowfile {}", new Object[] { flowFile }, e); session.remove(flowFile); } key.clear(); } } finally { IOUtils.closeQuietly(reader); } return flowFiles; }
From source file:org.apache.nifi.processors.hadoop.ValueReader.java
License:Apache License
@Override public Set<FlowFile> readSequenceFile(final Path file, Configuration configuration, FileSystem fileSystem) throws IOException { Set<FlowFile> flowFiles = new HashSet<>(); final SequenceFile.Reader reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file))); final String inputfileName = file.getName() + "." + System.nanoTime() + "."; int counter = 0; LOG.debug("Reading from sequence file {}", new Object[] { file }); final OutputStreamWritableCallback writer = new OutputStreamWritableCallback(reader); Text key = new Text(); try {//from ww w .ja v a 2 s . co m while (reader.next(key)) { String fileName = key.toString(); // the key may be a file name, and may not if (LOOKS_LIKE_FILENAME.matcher(fileName).matches()) { if (fileName.contains(File.separator)) { fileName = StringUtils.substringAfterLast(fileName, File.separator); } fileName = fileName + "." + System.nanoTime(); } else { fileName = inputfileName + ++counter; } FlowFile flowFile = session.create(); flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), fileName); try { flowFile = session.write(flowFile, writer); flowFiles.add(flowFile); } catch (ProcessException e) { LOG.error("Could not write to flowfile {}", new Object[] { flowFile }, e); session.remove(flowFile); } key.clear(); } } finally { IOUtils.closeQuietly(reader); } return flowFiles; }