List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match pathPattern and are not checksum files.
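Before the project-specific examples, here is a minimal self-contained sketch of the typical call pattern (the directory name and glob are illustrative placeholders, not taken from any of the projects below): get a FileSystem for the path, glob for the reducer output files, and guard against the null result that globStatus returns when the path does not exist.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical job output directory; pass a real path as the first argument.
        Path outputDir = new Path(args.length > 0 ? args[0] : "/tmp/job-output");
        FileSystem fs = FileSystem.get(outputDir.toUri(), conf);

        // Match the per-reducer output files; checksum (.crc) files are never returned.
        FileStatus[] parts = fs.globStatus(new Path(outputDir, "part-*"));
        if (parts == null || parts.length == 0) { // null when the path does not exist
            System.out.println("No matching files under " + outputDir);
            return;
        }
        for (FileStatus part : parts) {
            System.out.println(part.getPath() + "\t" + part.getLen() + " bytes");
        }
    }
}

The examples that follow use this same shape: glob for the part-* outputs of a MapReduce job, then open each matched file.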
From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/**
 * Read the feature frequency list which is built at the end of the parallel counting job.
 *
 * @param params
 * @return feature frequency list
 * @throws IOException
 */
public static List<Pair<String, Long>> readFList(Parameters params) throws IOException {
    Writable key = new Text();
    LongWritable value = new LongWritable();
    int minSupport = Integer.valueOf(params.get("minSupport", "3"));
    Configuration conf = new Configuration();

    Path parallelCountingPath = new Path(params.get("output"), "parallelcounting");
    FileSystem fs = FileSystem.get(parallelCountingPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(parallelCountingPath, "part-*"));

    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
            new Comparator<Pair<String, Long>>() {
                @Override
                public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                    int ret = o2.getSecond().compareTo(o1.getSecond());
                    if (ret != 0) {
                        return ret;
                    }
                    return o1.getFirst().compareTo(o2.getFirst());
                }
            });

    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is the feature, value is its count
        while (reader.next(key, value)) {
            if (value.get() >= minSupport) {
                queue.add(new Pair<String, Long>(key.toString(), value.get()));
            }
        }
    }

    List<Pair<String, Long>> fList = new ArrayList<Pair<String, Long>>();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}
From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/**
 * Read the frequent patterns generated from text.
 *
 * @param params
 * @return list of top-k patterns for each frequent string feature
 * @throws IOException
 */
public static List<Pair<String, TopKStringPatterns>> readFrequentPattern(Parameters params) throws IOException {
    Configuration conf = new Configuration();

    Path frequentPatternsPath = new Path(params.get("output"), "frequentPatterns");
    FileSystem fs = FileSystem.get(frequentPatternsPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(frequentPatternsPath, "part-*"));

    List<Pair<String, TopKStringPatterns>> ret = new ArrayList<Pair<String, TopKStringPatterns>>();
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        ret.addAll(FPGrowth.readFrequentPattern(fs, conf, path));
    }
    return ret;
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.ABtDenseOutJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputAPaths, Path inputBtGlob, Path xiPath, Path sqPath,
        Path sbPath, Path outputPath, int aBlockRows, int minSplitSize, int k, int p, int outerProdBlockHeight,
        int numReduceTasks, boolean broadcastBInput)
    throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    Job job = new Job(oldApiJob);
    job.setJobName("ABt-job");
    job.setJarByClass(ABtDenseOutJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputAPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(DenseBlockWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(ABtMapper.class);
    job.setReducerClass(QRReducer.class);

    job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, outerProdBlockHeight);
    job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
    job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
    job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
        job.getConfiguration().set(PROP_SB_PATH, sbPath.toString());
        job.getConfiguration().set(PROP_SQ_PATH, sqPath.toString());
    }

    job.setNumReduceTasks(numReduceTasks);

    // Broadcast Bt files if required.
    if (broadcastBInput) {
        job.getConfiguration().set(PROP_BT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(inputBtGlob);
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * The new API is not enabled yet in our dependencies at this time,
                 * still using the deprecated one.
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("ABt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.ABtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputAPaths, Path inputBtGlob, Path outputPath, int aBlockRows,
        int minSplitSize, int k, int p, int outerProdBlockHeight, int numReduceTasks, boolean broadcastBInput)
    throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    // MultipleOutputs
    //     .addNamedOutput(oldApiJob,
    //                     QJob.OUTPUT_QHAT,
    //                     org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
    //                     SplitPartitionedWritable.class,
    //                     DenseBlockWritable.class);
    //
    // MultipleOutputs
    //     .addNamedOutput(oldApiJob,
    //                     QJob.OUTPUT_RHAT,
    //                     org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
    //                     SplitPartitionedWritable.class,
    //                     VectorWritable.class);

    Job job = new Job(oldApiJob);
    job.setJobName("ABt-job");
    job.setJarByClass(ABtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputAPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(ABtMapper.class);
    job.setCombinerClass(BtJob.OuterProductCombiner.class);
    job.setReducerClass(QRReducer.class);

    job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, outerProdBlockHeight);
    job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
    job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
    job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());

    // The number of reduce tasks doesn't matter; we don't actually
    // send anything to reducers.
    job.setNumReduceTasks(numReduceTasks);

    // Broadcast Bt files if required.
    if (broadcastBInput) {
        job.getConfiguration().set(PROP_BT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(inputBtGlob);
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * The new API is not enabled yet in our dependencies at this time,
                 * still using the deprecated one.
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), conf);
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("ABt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.BtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPathA, Path inputPathQJob, Path xiPath, Path outputPath,
        int minSplitSize, int k, int p, int btBlockHeight, int numReduceTasks, boolean broadcast,
        Class<? extends Writable> labelClass, boolean outputBBtProducts)
    throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_Q,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, labelClass, VectorWritable.class);

    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_BBT,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
        /*
         * MAHOUT-1067: if we are asked to output BBt products then named vector
         * names should be propagated to Q too so that UJob could pick them up
         * from there.
         */
        oldApiJob.setBoolean(PROP_NV, true);
    }

    if (xiPath != null) {
        // Compute PCA-related outputs as well.
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SQ,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SB,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
    }

    /*
     * HACK: we use the old-api MultipleOutputs since they are not available in the
     * new api of either 0.20.2 or 0.20.203, but wrap it into a new-api job so we
     * can use new-api interfaces.
     */
    Job job = new Job(oldApiJob);
    job.setJobName("Bt-job");
    job.setJarByClass(BtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathA);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);

    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    job.getConfiguration().setInt(QJob.PROP_K, k);
    job.getConfiguration().setInt(QJob.PROP_P, p);
    job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
    job.getConfiguration().setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);

    job.setNumReduceTasks(numReduceTasks);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
    }

    /*
     * We can broadcast the Rhat files since all of them are required by each task,
     * but not the Q files, which correspond to splits of A (each split of A
     * requires only its particular Q file, each time a different one).
     */
    if (broadcast) {
        job.getConfiguration().set(PROP_RHAT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputPathQJob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(new Path(inputPathQJob, QJob.OUTPUT_RHAT + "-*"));
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * The new API is not enabled yet in our dependencies at this time,
                 * still using the deprecated one.
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Bt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.SSVDHelper.java
License:Apache License
/**
 * Sniff the label type in the input files.
 */
static Class<? extends Writable> sniffInputLabelType(Path[] inputPath, Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    for (Path p : inputPath) {
        FileStatus[] fstats = fs.globStatus(p);
        if (fstats == null || fstats.length == 0) {
            continue;
        }

        FileStatus firstSeqFile;
        if (fstats[0].isDir()) {
            firstSeqFile = fs.listStatus(fstats[0].getPath(), PathFilters.logsCRCFilter())[0];
        } else {
            firstSeqFile = fstats[0];
        }

        SequenceFile.Reader r = null;
        try {
            r = new SequenceFile.Reader(fs, firstSeqFile.getPath(), conf);
            return r.getKeyClass().asSubclass(Writable.class);
        } finally {
            Closeables.close(r, true);
        }
    }
    throw new IOException("Unable to open input files to determine input label type.");
}
From source file:org.apache.mahout.utils.ConcatenateVectorsJob.java
License:Apache License
private Class<? extends Writable> getKeyClass(Path path, FileSystem fs) throws IOException {
    // This works both for a part* file and for a directory containing part* files.
    Path pathPattern = new Path(path, "part*");
    FileStatus[] paths = fs.globStatus(pathPattern);
    Preconditions.checkArgument(paths.length > 0, path.getName() + " is a file, should be a directory");

    Path file = paths[0].getPath();
    SequenceFile.Reader reader = null;
    try {
        reader = new SequenceFile.Reader(fs, file, fs.getConf());
        return reader.getKeyClass().asSubclass(Writable.class);
    } finally {
        Closeables.close(reader, true);
    }
}
From source file:org.apache.mahout.utils.eval.InMemoryFactorizationEvaluator.java
License:Apache License
private Matrix readMatrix(Path dir) throws IOException {
    Matrix matrix = new SparseMatrix(new int[] { Integer.MAX_VALUE, Integer.MAX_VALUE });

    FileSystem fs = dir.getFileSystem(getConf());
    for (FileStatus seqFile : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = seqFile.getPath();
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, path, getConf());
            IntWritable key = new IntWritable();
            VectorWritable value = new VectorWritable();
            while (reader.next(key, value)) {
                int row = key.get();
                Iterator<Vector.Element> elementsIterator = value.get().iterateNonZero();
                while (elementsIterator.hasNext()) {
                    Vector.Element element = elementsIterator.next();
                    matrix.set(row, element.index(), element.get());
                }
            }
        } finally {
            IOUtils.quietClose(reader);
        }
    }
    return matrix;
}
From source file:org.apache.mahout.utils.eval.InMemoryFactorizationEvaluator.java
License:Apache License
private List<Preference> readProbePreferences(Path dir) throws IOException {
    List<Preference> preferences = new LinkedList<Preference>();

    FileSystem fs = dir.getFileSystem(getConf());
    for (FileStatus seqFile : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = seqFile.getPath();
        InputStream in = null;
        try {
            in = fs.open(path);
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] tokens = TasteHadoopUtils.splitPrefTokens(line);
                long userID = Long.parseLong(tokens[0]);
                long itemID = Long.parseLong(tokens[1]);
                float value = Float.parseFloat(tokens[2]);
                preferences.add(new GenericPreference(userID, itemID, value));
            }
        } finally {
            IOUtils.quietClose(in);
        }
    }
    return preferences;
}
From source file:org.apache.mahout.utils.vectors.text.DictionaryVectorizer.java
License:Apache License
/**
 * Read the feature frequency list which is built at the end of the word count job and assign ids to the
 * features. This uses constant memory and runs at the speed of your disk read.
 *
 * @param minSupport
 * @param wordCountPath
 * @param dictionaryPathBase
 * @throws IOException
 */
private static List<Path> createDictionaryChunks(int minSupport, Path wordCountPath, Path dictionaryPathBase,
        int chunkSizeInMegabytes, Writable value, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = new ArrayList<Path>();

    Writable key = new Text();
    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(wordCountPath, OUTPUT_FILES_PATTERN));

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);

    long currentChunkSize = 0;
    int i = 0;
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is the feature, value is its count
        while (reader.next(key, value)) {
            if (currentChunkSize > chunkSizeLimit) {
                dictWriter.close();
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
    }
    maxTermDimension[0] = i;
    dictWriter.close();

    return chunkPaths;
}