Example usage for org.apache.hadoop.fs FileSystem makeQualified

List of usage examples for org.apache.hadoop.fs FileSystem makeQualified

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem makeQualified.

Prototype

public Path makeQualified(Path path) 

Document

Qualify a path to one which uses this FileSystem and, if relative, make it absolute.
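
A minimal, self-contained sketch (not taken from the projects below) of what makeQualified does; the relative path and the printed URIs are illustrative only:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MakeQualifiedExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Resolves against fs.defaultFS; with no cluster configuration this is the local file system.
        FileSystem fs = FileSystem.get(conf);

        Path relative = new Path("data/input"); // relative path, no scheme or authority
        Path qualified = fs.makeQualified(relative);

        // Prints something like file:/home/<user>/data/input locally, or
        // hdfs://<namenode>/user/<user>/data/input when fs.defaultFS points at HDFS.
        System.out.println(qualified);
    }
}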

Usage

From source file:org.apache.mahout.clustering.spectral.VectorMatrixMultiplicationJob.java

License:Apache License

public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath, Path tmpPath)
        throws IOException, ClassNotFoundException, InterruptedException {

    // set up the serialization of the diagonal vector
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(markovPath.toUri(), conf);
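    // Qualify both paths with this FileSystem's scheme and authority so they
    // remain unambiguous when handed to the MapReduce job below.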
    markovPath = fs.makeQualified(markovPath);
    outputPath = fs.makeQualified(outputPath);
    Path vectorOutputPath = new Path(outputPath.getParent(), "vector");
    VectorCache.save(new IntWritable(Keys.DIAGONAL_CACHE_INDEX), diag, vectorOutputPath, conf);

    // set up the job itself
    Job job = new Job(conf, "VectorMatrixMultiplication");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(VectorMatrixMultiplicationMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, markovPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setJarByClass(VectorMatrixMultiplicationJob.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // build the resulting DRM from the results
    return new DistributedRowMatrix(outputPath, tmpPath, diag.size(), diag.size());
}

From source file:org.apache.mahout.freqtermsets.PFPGrowth.java

License:Apache License

/**
 * Reads the older cached fLists from the Distributed Cache, applying the time
 * weight function to the frequencies.
 * 
 * @return Deserialized Feature Frequency List
 */
public static OpenObjectLongHashMap<String> readOlderCachedFLists(Configuration conf, long currWindowStart,
        TimeWeightFunction weightFunction) throws IOException {
    OpenObjectLongHashMap<String> list = new OpenObjectLongHashMap<String>();
    Path[] files = DistributedCache.getLocalCacheFiles(conf);
    if (files == null) {
        throw new IOException("Cannot read Frequency list from Distributed Cache");
    }
    for (int i = 0; i < files.length; ++i) {
        FileSystem fs = FileSystem.getLocal(conf);
        Path fListLocalPath = fs.makeQualified(files[i]);
        // Fallback if we are running locally.
        if (!fs.exists(fListLocalPath)) {
            URI[] filesURIs = DistributedCache.getCacheFiles(conf);
            if (filesURIs == null) {
                throw new IOException("Cannot read Frequency list from Distributed Cache");
            }
            fListLocalPath = new Path(filesURIs[i].getPath());
        }
        long listWindowStart = Long.parseLong(fListLocalPath.getParent().getParent().getName());
        for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(fListLocalPath,
                true, conf)) {
            String token = record.getFirst().toString();

            list.put(token, Math.round(list.get(token)
                    + weightFunction.apply(record.getSecond().get(), listWindowStart, currWindowStart)));
        }
    }
    return list;
}

From source file:org.apache.mahout.freqtermsets.PFPGrowth.java

License:Apache License

/**
 * Generates the fList from the serialized string representation
 * 
 * @return Deserialized Feature Frequency List
 */
public static List<Pair<String, Long>> readCachedFList(Configuration conf) throws IOException {
    List<Pair<String, Long>> list = new ArrayList<Pair<String, Long>>();
    Path[] files = DistributedCache.getLocalCacheFiles(conf);
    if (files == null) {
        throw new IOException("Cannot read Frequency list from Distributed Cache");
    }
    if (files.length != 1) {
        throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ")");
    }
    FileSystem fs = FileSystem.getLocal(conf);
    Path fListLocalPath = fs.makeQualified(files[0]);
    // Fallback if we are running locally.
    if (!fs.exists(fListLocalPath)) {
        URI[] filesURIs = DistributedCache.getCacheFiles(conf);
        if (filesURIs == null) {
            throw new IOException("Cannot read Frequency list from Distributed Cache");
        }
        if (filesURIs.length != 1) {
            throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ")");
        }
        fListLocalPath = new Path(filesURIs[0].getPath());
    }
    // Done below, while caching the list
    // // YA: Lang independent stop words removal
    // // FIXMENOT: as below
    // Parameters params = new Parameters(conf.get(PFP_PARAMETERS, ""));
    // int minFr = params.getInt(MIN_FREQ, MIN_FREQ_DEFAULT);
    // int prunePct = params.getInt(PRUNE_PCTILE, PRUNE_PCTILE_DEFAULT);
    //
    // // TODONOT: assert minFr >= minSupport;
    //
    // Iterator<Pair<Text, LongWritable>> tempIter = new SequenceFileIterable<Text, LongWritable>(
    // fListLocalPath, true, conf).iterator();
    // long maxFr = Long.MAX_VALUE;
    // if (tempIter.hasNext()) {
    // maxFr = tempIter.next().getSecond().get() * prunePct / 100;
    // }
    // tempIter = null;
    //
    // for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(
    // fListLocalPath, true, conf)) {
    // String token = record.getFirst().toString();
    // char ch0 = token.charAt(0);
    // if ((ch0 != '#' && ch0 != '@')
    // && (record.getSecond().get() < minFr || record.getSecond().get() > maxFr)) {
    // continue;
    // }
    // list.add(new Pair<String, Long>(token, record.getSecond().get()));
    // }
    // // END YA

    for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(fListLocalPath, true,
            conf)) {
        list.add(new Pair<String, Long>(record.getFirst().toString(), record.getSecond().get()));
    }

    return list;
}

From source file:org.apache.mahout.freqtermsets.PFPGrowth.java

License:Apache License

/**
 * Serializes the fList to the given path and registers it in the Distributed Cache.
 * 
 * @param flistPath
 *          path the serialized frequency list is written to
 */
public static void saveFList(Iterable<Pair<String, Long>> flist, // Parameters params,
        Configuration conf, Path flistPath) throws IOException {
    FileSystem fs = FileSystem.get(flistPath.toUri(), conf);
    flistPath = fs.makeQualified(flistPath);
    // HadoopUtil.delete(conf, flistPath);
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, flistPath, Text.class, LongWritable.class);
    try {
        for (Pair<String, Long> pair : flist) {
            writer.append(new Text(pair.getFirst()), new LongWritable(pair.getSecond()));
        }
    } finally {
        writer.close();
    }
    DistributedCache.addCacheFile(flistPath.toUri(), conf);
}

From source file:org.apache.mahout.ga.watchmaker.cd.DataLineTest.java

License:Apache License

public void testSet() throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    Path inpath = fs.makeQualified(new Path(Resources.getResource("wdbc").toString()));

    DataSet dataset = FileInfoParser.parseFile(fs, inpath);
    DataSet.initialize(dataset);

    DataLine dl = new DataLine();

    int labelpos = dataset.getLabelIndex();

    dl.set(datalines[0]);
    assertEquals(dataset.valueIndex(labelpos, "M"), dl.getLabel());

    dl.set(datalines[1]);
    assertEquals(dataset.valueIndex(labelpos, "B"), dl.getLabel());

    dl.set(datalines[2]);
    assertEquals(dataset.valueIndex(labelpos, "M"), dl.getLabel());
}

From source file:org.apache.mahout.ga.watchmaker.cd.FileInfosDatasetTest.java

License:Apache License

public void testRanges() throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    Path inpath = fs.makeQualified(new Path(Resources.getResource("wdbc").toString()));

    DataSet dataset = FileInfoParser.parseFile(fs, inpath);
    DataSet.initialize(dataset);

    DataLine dl = new DataLine();
    for (String line : new FileLineIterable(new File(Resources.getResource("wdbc/wdbc.data").getPath()))) {
        dl.set(line);
        for (int index = 0; index < dataset.getNbAttributes(); index++) {
            if (dataset.isNumerical(index)) {
                assertInRange(dl.getAttribute(index), dataset.getMin(index), dataset.getMax(index));
            } else {
                assertInRange(dl.getAttribute(index), 0, dataset.getNbValues(index));
            }
        }
    }
}

From source file:org.apache.mahout.ga.watchmaker.cd.hadoop.CDMahoutEvaluatorTest.java

License:Apache License

public void testEvaluate() throws Exception {
    int nbrules = 100;
    Random rng = RandomUtils.getRandom();
    int target = 1;

    // random rules
    List<Rule> rules = new ArrayList<Rule>();
    for (int index = 0; index < nbrules; index++) {
        rules.add(new RandomRule(index, target, rng));
    }

    // dataset
    // This is sensitive to the working directory where the test is run:
    FileSystem fs = FileSystem.get(new Configuration());
    Path input = fs.makeQualified(new Path(Resources.getResource("wdbc").toString()));
    CDMahoutEvaluator.initializeDataSet(input);

    // evaluate the rules
    List<CDFitness> results = new ArrayList<CDFitness>();
    Path output = getTestTempDirPath("output");
    fs = output.getFileSystem(new Configuration());
    fs.delete(output, true); // It's unhappy if this directory exists
    CDMahoutEvaluator.evaluate(rules, target, input, output, results);

    // check the results
    for (int index = 0; index < nbrules; index++) {
        assertEquals("rule " + index, RandomRuleResults.getResult(index), results.get(index));
    }

}

From source file:org.apache.mahout.math.hadoop.TimesSquaredJob.java

License:Apache License

public static Job createTimesSquaredJob(Configuration initialConf, Vector v, int outputVectorDim,
        Path matrixInputPath, Path outputVectorPathBase, Class<? extends TimesSquaredMapper> mapClass,
        Class<? extends VectorSummingReducer> redClass) throws IOException {

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), initialConf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

    long now = System.nanoTime();
    Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + now);

    SequenceFile.Writer inputVectorPathWriter = null;

    try {
        inputVectorPathWriter = new SequenceFile.Writer(fs, initialConf, inputVectorPath, NullWritable.class,
                VectorWritable.class);
        inputVectorPathWriter.append(NullWritable.get(), new VectorWritable(v));
    } finally {
        Closeables.close(inputVectorPathWriter, false);
    }

    URI ivpURI = inputVectorPath.toUri();
    DistributedCache.setCacheFiles(new URI[] { ivpURI }, initialConf);

    Job job = HadoopUtil.prepareJob(matrixInputPath, new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME),
            SequenceFileInputFormat.class, mapClass, NullWritable.class, VectorWritable.class, redClass,
            NullWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, initialConf);
    job.setCombinerClass(redClass);
    job.setJobName("TimesSquaredJob: " + matrixInputPath);

    Configuration conf = job.getConfiguration();
    conf.set(INPUT_VECTOR, ivpURI.toString());
    conf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
    conf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);

    return job;
}

From source file:org.apache.nifi.processors.hadoop.KeyValueReader.java

License:Apache License

@Override
public Set<FlowFile> readSequenceFile(Path file, Configuration configuration, FileSystem fileSystem)
        throws IOException {

    final SequenceFile.Reader reader;

    Set<FlowFile> flowFiles = new HashSet<>();
    reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file)));
    final Text key = new Text();
    final KeyValueWriterCallback callback = new KeyValueWriterCallback(reader);
    final String inputfileName = file.getName() + "." + System.nanoTime() + ".";
    int counter = 0;
    LOG.debug("Read from SequenceFile: {} ", new Object[] { file });
    try {
        while (reader.next(key)) {
            String fileName = key.toString();
            // the key may be a file name, and may not
            if (LOOKS_LIKE_FILENAME.matcher(fileName).matches()) {
                if (fileName.contains(File.separator)) {
                    fileName = StringUtils.substringAfterLast(fileName, File.separator);
                }
                fileName = fileName + "." + System.nanoTime();
            } else {
                fileName = inputfileName + ++counter;
            }

            FlowFile flowFile = session.create();
            flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), fileName);
            callback.key = key;
            try {
                flowFile = session.write(flowFile, callback);
                flowFiles.add(flowFile);
            } catch (ProcessException e) {
                LOG.error("Could not write to flowfile {}", new Object[] { flowFile }, e);
                session.remove(flowFile);
            }
            key.clear();
        }
    } finally {
        IOUtils.closeQuietly(reader);
    }

    return flowFiles;
}

From source file:org.apache.nifi.processors.hadoop.ValueReader.java

License:Apache License

@Override
public Set<FlowFile> readSequenceFile(final Path file, Configuration configuration, FileSystem fileSystem)
        throws IOException {

    Set<FlowFile> flowFiles = new HashSet<>();
    final SequenceFile.Reader reader = new SequenceFile.Reader(configuration,
            Reader.file(fileSystem.makeQualified(file)));
    final String inputfileName = file.getName() + "." + System.nanoTime() + ".";
    int counter = 0;
    LOG.debug("Reading from sequence file {}", new Object[] { file });
    final OutputStreamWritableCallback writer = new OutputStreamWritableCallback(reader);
    Text key = new Text();
    try {
        while (reader.next(key)) {
            String fileName = key.toString();
            // the key may be a file name, and may not
            if (LOOKS_LIKE_FILENAME.matcher(fileName).matches()) {
                if (fileName.contains(File.separator)) {
                    fileName = StringUtils.substringAfterLast(fileName, File.separator);
                }
                fileName = fileName + "." + System.nanoTime();
            } else {
                fileName = inputfileName + ++counter;
            }
            FlowFile flowFile = session.create();
            flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), fileName);
            try {
                flowFile = session.write(flowFile, writer);
                flowFiles.add(flowFile);
            } catch (ProcessException e) {
                LOG.error("Could not write to flowfile {}", new Object[] { flowFile }, e);
                session.remove(flowFile);
            }
            key.clear();
        }
    } finally {
        IOUtils.closeQuietly(reader);
    }

    return flowFiles;
}