Example usage for org.apache.hadoop.fs FileSystem makeQualified

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem.makeQualified.

Prototype

public Path makeQualified(Path path)

Source Link

Document

Qualifies a path so that it uses this FileSystem and, if the path is relative, makes it absolute.

Usage

From source file:org.apache.mahout.clustering.spectral.VectorMatrixMultiplicationJob.java

License:Apache License

/**
 * Runs a map-only job over the matrix at {@code markovPath} using
 * {@link VectorMatrixMultiplicationMapper}, after saving {@code diag} through
 * {@code VectorCache.save}.
 *
 * @param markovPath input path of the matrix; qualified against its own file system
 * @param diag vector saved under {@code Keys.DIAGONAL_CACHE_INDEX}; its size is used for both
 *        dimensions of the returned matrix
 * @param outputPath job output path; the vector is written to a sibling named "vector"
 * @param tmpPath temporary path handed to the returned {@link DistributedRowMatrix}
 * @return a {@code DistributedRowMatrix} over {@code outputPath}
 * @throws IllegalStateException if the job does not complete successfully
 */
public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath, Path tmpPath)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration conf = new Configuration();

    // Both paths are qualified against the file system that hosts the input matrix.
    FileSystem fileSystem = FileSystem.get(markovPath.toUri(), conf);
    markovPath = fileSystem.makeQualified(markovPath);
    outputPath = fileSystem.makeQualified(outputPath);

    // Serialize the diagonal vector into a sibling of the output directory.
    Path diagonalPath = new Path(outputPath.getParent(), "vector");
    VectorCache.save(new IntWritable(Keys.DIAGONAL_CACHE_INDEX), diag, diagonalPath, conf);

    // Map-only job: sequence files in, sequence files of (IntWritable, VectorWritable) out.
    Job job = new Job(conf, "VectorMatrixMultiplication");
    job.setJarByClass(VectorMatrixMultiplicationJob.class);
    job.setMapperClass(VectorMatrixMultiplicationMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    FileInputFormat.addInputPath(job, markovPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    if (!job.waitForCompletion(true)) {
        throw new IllegalStateException("Job failed!");
    }

    // Wrap the job output as a square matrix sized by the diagonal vector.
    return new DistributedRowMatrix(outputPath, tmpPath, diag.size(), diag.size());
}

From source file:org.apache.mahout.freqtermsets.PFPGrowth.java

License:Apache License

/**
 * Reads every frequency list registered in the distributed cache and merges them into one
 * map, weighting each stored count with {@code weightFunction}.
 *
 * <p>The window start of each list is parsed from the name of the grandparent directory of
 * the list file. If a localized cache path does not exist on the local file system, the path
 * component of the cache URI at the same index is used instead.
 *
 * @param conf configuration from which the distributed cache entries are read
 * @param currWindowStart start of the current window, passed to {@code weightFunction}
 * @param weightFunction applied to each stored count together with the window start of its
 *        list and {@code currWindowStart}
 * @return map from token to the accumulated, rounded weighted count
 * @throws IOException if the cache file lists are missing or too short, or a list cannot be
 *         read
 */
public static OpenObjectLongHashMap<String> readOlderCachedFLists(Configuration conf, long currWindowStart,
        TimeWeightFunction weightFunction) throws IOException {
    OpenObjectLongHashMap<String> list = new OpenObjectLongHashMap<String>();
    Path[] files = DistributedCache.getLocalCacheFiles(conf);
    if (files == null) {
        throw new IOException("Cannot read Frequency list from Distributed Cache");
    }
    // The local file system does not depend on the loop index, so resolve it once.
    FileSystem fs = FileSystem.getLocal(conf);
    for (int i = 0; i < files.length; ++i) {
        Path fListLocalPath = fs.makeQualified(files[i]);
        // Fallback if we are running locally.
        if (!fs.exists(fListLocalPath)) {
            URI[] filesURIs = DistributedCache.getCacheFiles(conf);
            if (filesURIs == null) {
                throw new IOException("Cannot read Frequency list from Distributed Cache");
            }
            // Fail with a descriptive error rather than an ArrayIndexOutOfBoundsException
            // when the URI list is shorter than the localized file list.
            if (i >= filesURIs.length) {
                throw new IOException(
                        "Cannot read Frequency list from Distributed Cache (" + filesURIs.length + ")");
            }
            fListLocalPath = new Path(filesURIs[i].getPath());
        }
        // The grandparent directory name carries the window start of this list.
        long listWindowStart = Long.parseLong(fListLocalPath.getParent().getParent().getName());
        for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(fListLocalPath,
                true, conf)) {
            String token = record.getFirst().toString();

            // Accumulate the weighted count on top of whatever is already stored for the token.
            list.put(token, Math.round(list.get(token)
                    + weightFunction.apply(record.getSecond().get(), listWindowStart, currWindowStart)));
        }
    }
    return list;
}

From source file:org.apache.mahout.freqtermsets.PFPGrowth.java

License:Apache License

/**
 * Generates the fList from the serialized string representation
 * /*  www.java 2  s  .c  o  m*/
 * @return Deserialized Feature Frequency List
 */
public static List<Pair<String, Long>> readCachedFList(Configuration conf) throws IOException {
    List<Pair<String, Long>> list = new ArrayList<Pair<String, Long>>();
    Path[] files = DistributedCache.getLocalCacheFiles(conf);
    if (files == null) {
        throw new IOException("Cannot read Frequency list from Distributed Cache");
    }
    if (files.length != 1) {
        throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ")");
    }
    FileSystem fs = FileSystem.getLocal(conf);
    Path fListLocalPath = fs.makeQualified(files[0]);
    // Fallback if we are running locally.
    if (!fs.exists(fListLocalPath)) {
        URI[] filesURIs = DistributedCache.getCacheFiles(conf);
        if (filesURIs == null) {
            throw new IOException("Cannot read Frequency list from Distributed Cache");
        }
        if (filesURIs.length != 1) {
            throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ")");
        }
        fListLocalPath = new Path(filesURIs[0].getPath());
    }
    // Done below, while caching the list
    // // YA: Lang independent stop words removal
    // // FIXMENOT: as below
    // Parameters params = new Parameters(conf.get(PFP_PARAMETERS, ""));
    // int minFr = params.getInt(MIN_FREQ, MIN_FREQ_DEFAULT);
    // int prunePct = params.getInt(PRUNE_PCTILE, PRUNE_PCTILE_DEFAULT);
    //
    // // TODONOT: assert minFr >= minSupport;
    //
    // Iterator<Pair<Text, LongWritable>> tempIter = new SequenceFileIterable<Text, LongWritable>(
    // fListLocalPath, true, conf).iterator();
    // long maxFr = Long.MAX_VALUE;
    // if (tempIter.hasNext()) {
    // maxFr = tempIter.next().getSecond().get() * prunePct / 100;
    // }
    // tempIter = null;
    //
    // for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(
    // fListLocalPath, true, conf)) {
    // String token = record.getFirst().toString();
    // char ch0 = token.charAt(0);
    // if ((ch0 != '#' && ch0 != '@')
    // && (record.getSecond().get() < minFr || record.getSecond().get() > maxFr)) {
    // continue;
    // }
    // list.add(new Pair<String, Long>(token, record.getSecond().get()));
    // }
    // // END YA

    for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(fListLocalPath, true,
            conf)) {
        list.add(new Pair<String, Long>(record.getFirst().toString(), record.getSecond().get()));
    }

    return list;
}

From source file:org.apache.mahout.freqtermsets.PFPGrowth.java

License:Apache License

/**
 * Serializes the fList and returns the string representation of the List
 * /*from   www.ja va2s  .  c  om*/
 * @param flistPath
 * 
 * @return Serialized String representation of List
 */
public static void saveFList(Iterable<Pair<String, Long>> flist, // Parameters params,
        Configuration conf, Path flistPath) throws IOException {
    FileSystem fs = FileSystem.get(flistPath.toUri(), conf);
    flistPath = fs.makeQualified(flistPath);
    // HadoopUtil.delete(conf, flistPath);
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, flistPath, Text.class, LongWritable.class);
    try {
        for (Pair<String, Long> pair : flist) {
            writer.append(new Text(pair.getFirst()), new LongWritable(pair.getSecond()));
        }
    } finally {
        writer.close();
    }
    DistributedCache.addCacheFile(flistPath.toUri(), conf);
}

From source file:org.apache.mahout.ga.watchmaker.cd.DataLineTest.java

License:Apache License

/**
 * Checks that {@code DataLine.set} yields the expected label index for the first three
 * entries of {@code datalines}, using the "wdbc" dataset description.
 */
public void testSet() throws Exception {
    FileSystem fileSystem = FileSystem.get(new Configuration());
    Path datasetPath = fileSystem.makeQualified(new Path(Resources.getResource("wdbc").toString()));

    DataSet dataset = FileInfoParser.parseFile(fileSystem, datasetPath);
    DataSet.initialize(dataset);

    DataLine line = new DataLine();
    int labelPosition = dataset.getLabelIndex();

    // Expected label of datalines[0], datalines[1] and datalines[2], in that order.
    String[] expectedLabels = { "M", "B", "M" };
    for (int i = 0; i < expectedLabels.length; i++) {
        line.set(datalines[i]);
        assertEquals(dataset.valueIndex(labelPosition, expectedLabels[i]), line.getLabel());
    }
}

From source file:org.apache.mahout.ga.watchmaker.cd.FileInfosDatasetTest.java

License:Apache License

/**
 * Checks, for every line of "wdbc/wdbc.data", that each attribute lies within the range the
 * dataset description reports for it: [min, max] for numerical attributes, and
 * [0, number of values] otherwise.
 */
public void testRanges() throws IOException {
    FileSystem fileSystem = FileSystem.get(new Configuration());
    Path datasetPath = fileSystem.makeQualified(new Path(Resources.getResource("wdbc").toString()));

    DataSet dataset = FileInfoParser.parseFile(fileSystem, datasetPath);
    DataSet.initialize(dataset);

    DataLine parsed = new DataLine();
    File dataFile = new File(Resources.getResource("wdbc/wdbc.data").getPath());
    for (String rawLine : new FileLineIterable(dataFile)) {
        parsed.set(rawLine);
        for (int attr = 0; attr < dataset.getNbAttributes(); attr++) {
            if (!dataset.isNumerical(attr)) {
                // Non-numerical attribute: bounded by the number of distinct values.
                assertInRange(parsed.getAttribute(attr), 0, dataset.getNbValues(attr));
            } else {
                // Numerical attribute: bounded by the recorded minimum and maximum.
                assertInRange(parsed.getAttribute(attr), dataset.getMin(attr), dataset.getMax(attr));
            }
        }
    }
}

From source file:org.apache.mahout.ga.watchmaker.cd.hadoop.CDMahoutEvaluatorTest.java

License:Apache License

/**
 * Evaluates 100 random rules against the "wdbc" dataset with {@code CDMahoutEvaluator} and
 * compares each fitness with the value from {@code RandomRuleResults}.
 */
public void testEvaluate() throws Exception {
    final int ruleCount = 100;
    final int target = 1;
    Random rng = RandomUtils.getRandom();

    // Build the random rules; they are created in index order from the same generator.
    List<Rule> rules = new ArrayList<Rule>();
    for (int i = 0; i < ruleCount; i++) {
        rules.add(new RandomRule(i, target, rng));
    }

    // Dataset. This is sensitive to the working directory where the test is run.
    FileSystem inputFs = FileSystem.get(new Configuration());
    Path input = inputFs.makeQualified(new Path(Resources.getResource("wdbc").toString()));
    CDMahoutEvaluator.initializeDataSet(input);

    // Evaluate the rules. The output directory must not exist beforehand, so remove it.
    List<CDFitness> results = new ArrayList<CDFitness>();
    Path output = getTestTempDirPath("output");
    FileSystem outputFs = output.getFileSystem(new Configuration());
    outputFs.delete(output, true);
    CDMahoutEvaluator.evaluate(rules, target, input, output, results);

    // Every rule must produce the expected fitness.
    for (int i = 0; i < ruleCount; i++) {
        assertEquals("rule " + i, RandomRuleResults.getResult(i), results.get(i));
    }
}

From source file:org.apache.mahout.math.hadoop.TimesSquaredJob.java

License:Apache License

/**
 * Writes {@code v} to a sequence file under {@code outputVectorPathBase}, registers that file
 * as the only distributed-cache file of {@code initialConf}, and prepares a job over
 * {@code matrixInputPath} with the given mapper and reducer (the reducer is also used as the
 * combiner).
 *
 * @param initialConf configuration used for the file system, the cache and the job
 * @param v input vector to serialize
 * @param outputVectorDim value stored under {@code OUTPUT_VECTOR_DIMENSION}
 * @param matrixInputPath job input; qualified against its own file system
 * @param outputVectorPathBase base directory for the input-vector file and the job output
 * @param mapClass mapper class
 * @param redClass reducer and combiner class
 * @return the configured, not yet submitted job
 * @throws IOException if the vector file cannot be written or the job cannot be prepared
 */
public static Job createTimesSquaredJob(Configuration initialConf, Vector v, int outputVectorDim,
        Path matrixInputPath, Path outputVectorPathBase, Class<? extends TimesSquaredMapper> mapClass,
        Class<? extends VectorSummingReducer> redClass) throws IOException {

    FileSystem fileSystem = FileSystem.get(matrixInputPath.toUri(), initialConf);
    matrixInputPath = fileSystem.makeQualified(matrixInputPath);
    outputVectorPathBase = fileSystem.makeQualified(outputVectorPathBase);

    // The vector file name is the current nanoTime reading, placed under INPUT_VECTOR.
    Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + System.nanoTime());

    SequenceFile.Writer vectorWriter = new SequenceFile.Writer(fileSystem, initialConf, inputVectorPath,
            NullWritable.class, VectorWritable.class);
    try {
        vectorWriter.append(NullWritable.get(), new VectorWritable(v));
    } finally {
        Closeables.close(vectorWriter, false);
    }

    // Replace the cache file list with just the serialized input vector.
    URI inputVectorUri = inputVectorPath.toUri();
    DistributedCache.setCacheFiles(new URI[] { inputVectorUri }, initialConf);

    Path jobOutputPath = new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME);
    Job job = HadoopUtil.prepareJob(matrixInputPath, jobOutputPath, SequenceFileInputFormat.class, mapClass,
            NullWritable.class, VectorWritable.class, redClass, NullWritable.class, VectorWritable.class,
            SequenceFileOutputFormat.class, initialConf);
    job.setJobName("TimesSquaredJob: " + matrixInputPath);
    job.setCombinerClass(redClass);

    // These settings go on the job's own configuration, not on initialConf.
    Configuration jobConf = job.getConfiguration();
    jobConf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);
    jobConf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
    jobConf.set(INPUT_VECTOR, inputVectorUri.toString());

    return job;
}

From source file:org.apache.nifi.processors.hadoop.KeyValueReader.java

License:Apache License

/**
 * Reads every record of the given sequence file and writes each one to a newly created
 * FlowFile through a {@code KeyValueWriterCallback}.
 *
 * <p>The FlowFile name comes from the record key when the key matches
 * {@code LOOKS_LIKE_FILENAME} (last path segment plus a nanoTime suffix). Otherwise it is the
 * input file name, a nanoTime stamp and a running counter.
 *
 * @param file sequence file to read; qualified against {@code fileSystem} before opening
 * @param configuration configuration passed to the sequence file reader
 * @param fileSystem file system used to qualify {@code file}
 * @return the FlowFiles that were written successfully
 * @throws IOException if the sequence file cannot be opened or read
 */
@Override
public Set<FlowFile> readSequenceFile(Path file, Configuration configuration, FileSystem fileSystem)
        throws IOException {

    Set<FlowFile> created = new HashSet<>();
    final SequenceFile.Reader reader = new SequenceFile.Reader(configuration,
            Reader.file(fileSystem.makeQualified(file)));
    final Text key = new Text();
    final KeyValueWriterCallback callback = new KeyValueWriterCallback(reader);
    // Prefix for records whose key cannot serve as a file name.
    final String fallbackPrefix = file.getName() + "." + System.nanoTime() + ".";
    int counter = 0;
    LOG.debug("Read from SequenceFile: {} ", new Object[] { file });
    try {
        while (reader.next(key)) {
            String name = key.toString();
            // The key may be a file name, and may not.
            if (!LOOKS_LIKE_FILENAME.matcher(name).matches()) {
                counter++;
                name = fallbackPrefix + counter;
            } else {
                if (name.contains(File.separator)) {
                    name = StringUtils.substringAfterLast(name, File.separator);
                }
                name = name + "." + System.nanoTime();
            }

            FlowFile flowFile = session.putAttribute(session.create(), CoreAttributes.FILENAME.key(), name);
            callback.key = key;
            try {
                flowFile = session.write(flowFile, callback);
                created.add(flowFile);
            } catch (ProcessException e) {
                // A failed write drops only this FlowFile; reading continues.
                LOG.error("Could not write to flowfile {}", new Object[] { flowFile }, e);
                session.remove(flowFile);
            }
            key.clear();
        }
    } finally {
        IOUtils.closeQuietly(reader);
    }

    return created;
}

From source file:org.apache.nifi.processors.hadoop.ValueReader.java

License:Apache License

/**
 * Reads every record of the given sequence file and writes each one to a newly created
 * FlowFile through an {@code OutputStreamWritableCallback}.
 *
 * <p>The FlowFile name comes from the record key when the key matches
 * {@code LOOKS_LIKE_FILENAME} (last path segment plus a nanoTime suffix). Otherwise it is the
 * input file name, a nanoTime stamp and a running counter.
 *
 * @param file sequence file to read; qualified against {@code fileSystem} before opening
 * @param configuration configuration passed to the sequence file reader
 * @param fileSystem file system used to qualify {@code file}
 * @return the FlowFiles that were written successfully
 * @throws IOException if the sequence file cannot be opened or read
 */
@Override
public Set<FlowFile> readSequenceFile(final Path file, Configuration configuration, FileSystem fileSystem)
        throws IOException {

    Set<FlowFile> created = new HashSet<>();
    final Path qualifiedFile = fileSystem.makeQualified(file);
    final SequenceFile.Reader reader = new SequenceFile.Reader(configuration, Reader.file(qualifiedFile));
    // Prefix for records whose key cannot serve as a file name.
    final String fallbackPrefix = file.getName() + "." + System.nanoTime() + ".";
    int counter = 0;
    LOG.debug("Reading from sequence file {}", new Object[] { file });
    final OutputStreamWritableCallback writer = new OutputStreamWritableCallback(reader);
    Text key = new Text();
    try {
        while (reader.next(key)) {
            String name = key.toString();
            // The key may be a file name, and may not.
            if (!LOOKS_LIKE_FILENAME.matcher(name).matches()) {
                counter++;
                name = fallbackPrefix + counter;
            } else {
                if (name.contains(File.separator)) {
                    name = StringUtils.substringAfterLast(name, File.separator);
                }
                name = name + "." + System.nanoTime();
            }

            FlowFile flowFile = session.putAttribute(session.create(), CoreAttributes.FILENAME.key(), name);
            try {
                flowFile = session.write(flowFile, writer);
                created.add(flowFile);
            } catch (ProcessException e) {
                // A failed write drops only this FlowFile; reading continues.
                LOG.error("Could not write to flowfile {}", new Object[] { flowFile }, e);
                session.remove(flowFile);
            }
            key.clear();
        }
    } finally {
        IOUtils.closeQuietly(reader);
    }

    return created;
}