Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
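
Before looking at the real-world examples below, here is a minimal, self-contained sketch of the typical pattern (the class name and the output directory path are hypothetical): open a FileSystem for the path's URI, glob for the part-* files a MapReduce job produced, and iterate over the returned FileStatus array.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();

        // Hypothetical job output directory; adjust for your cluster.
        Path outputDir = new Path("/tmp/job-output");
        FileSystem fs = FileSystem.get(outputDir.toUri(), conf);

        // Match all reducer output files, e.g. part-r-00000, part-r-00001, ...
        FileStatus[] parts = fs.globStatus(new Path(outputDir, "part-*"));

        // globStatus returns null when the pattern contains no glob and the path
        // does not exist, and an empty array when a glob matches nothing, so
        // guard before iterating.
        if (parts != null) {
            for (FileStatus status : parts) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}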

Usage

From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java

License:Apache License

/**
 * Read the feature frequency list which is built at the end of the Parallel counting job.
 * @param params
 * @return Feature Frequency List
 * @throws IOException
 */
public static List<Pair<String, Long>> readFList(Parameters params) throws IOException {
    Writable key = new Text();
    LongWritable value = new LongWritable();
    int minSupport = Integer.valueOf(params.get("minSupport", "3"));
    Configuration conf = new Configuration();

    Path parallelCountingPath = new Path(params.get("output"), "parallelcounting");
    FileSystem fs = FileSystem.get(parallelCountingPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(parallelCountingPath, "part-*"));

    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
            new Comparator<Pair<String, Long>>() {

                @Override
                public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                    int ret = o2.getSecond().compareTo(o1.getSecond());
                    if (ret != 0) {
                        return ret;
                    }
                    return o1.getFirst().compareTo(o2.getFirst());
                }

            });
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is the feature, value is its count
        while (reader.next(key, value)) {
            if (value.get() >= minSupport) {
                queue.add(new Pair<String, Long>(key.toString(), value.get()));
            }
        }
    }
    List<Pair<String, Long>> fList = new ArrayList<Pair<String, Long>>();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}
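
A note on the example above: globbing for part-* picks up only the job's output files and skips any non-part entries in the output directory (such as the _SUCCESS marker, when present), and the comparator orders pairs by descending count with ties broken by feature name, so draining the queue with poll() returns the feature list in descending frequency order.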

From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java

License:Apache License

/**
 * Read the Frequent Patterns generated from Text
 * @param params
 * @return List of TopK patterns for each string frequent feature
 * @throws IOException
 */
public static List<Pair<String, TopKStringPatterns>> readFrequentPattern(Parameters params) throws IOException {

    Configuration conf = new Configuration();

    Path frequentPatternsPath = new Path(params.get("output"), "frequentPatterns");
    FileSystem fs = FileSystem.get(frequentPatternsPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(frequentPatternsPath, "part-*"));

    List<Pair<String, TopKStringPatterns>> ret = new ArrayList<Pair<String, TopKStringPatterns>>();
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        ret.addAll(FPGrowth.readFrequentPattern(fs, conf, path));
    }
    return ret;
}

From source file:org.apache.mahout.math.hadoop.stochasticsvd.ABtDenseOutJob.java

License:Apache License

public static void run(Configuration conf, Path[] inputAPaths, Path inputBtGlob, Path xiPath, Path sqPath,
        Path sbPath, Path outputPath, int aBlockRows, int minSplitSize, int k, int p, int outerProdBlockHeight,
        int numReduceTasks, boolean broadcastBInput)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    Job job = new Job(oldApiJob);
    job.setJobName("ABt-job");
    job.setJarByClass(ABtDenseOutJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputAPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(DenseBlockWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(ABtMapper.class);
    job.setReducerClass(QRReducer.class);

    job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, outerProdBlockHeight);
    job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
    job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
    job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
        job.getConfiguration().set(PROP_SB_PATH, sbPath.toString());
        job.getConfiguration().set(PROP_SQ_PATH, sqPath.toString());
    }

    job.setNumReduceTasks(numReduceTasks);

    // broadcast Bt files if required.
    if (broadcastBInput) {
        job.getConfiguration().set(PROP_BT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(inputBtGlob);
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * The new API is not yet available in our dependencies at this
                 * time, so we still use the deprecated one.
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("ABt job unsuccessful.");
    }

}

From source file:org.apache.mahout.math.hadoop.stochasticsvd.ABtJob.java

License:Apache License

public static void run(Configuration conf, Path[] inputAPaths, Path inputBtGlob, Path outputPath,
        int aBlockRows, int minSplitSize, int k, int p, int outerProdBlockHeight, int numReduceTasks,
        boolean broadcastBInput) throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    // MultipleOutputs
    // .addNamedOutput(oldApiJob,
    // QJob.OUTPUT_QHAT,
    // org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
    // SplitPartitionedWritable.class,
    // DenseBlockWritable.class);
    //
    // MultipleOutputs
    // .addNamedOutput(oldApiJob,
    // QJob.OUTPUT_RHAT,
    // org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
    // SplitPartitionedWritable.class,
    // VectorWritable.class);

    Job job = new Job(oldApiJob);
    job.setJobName("ABt-job");
    job.setJarByClass(ABtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputAPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(ABtMapper.class);
    job.setCombinerClass(BtJob.OuterProductCombiner.class);
    job.setReducerClass(QRReducer.class);

    job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, outerProdBlockHeight);
    job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
    job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
    job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());

    // The number of reduce tasks doesn't matter; we don't actually
    // send anything to reducers.

    job.setNumReduceTasks(numReduceTasks);

    // broadcast Bt files if required.
    if (broadcastBInput) {
        job.getConfiguration().set(PROP_BT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(inputBtGlob);
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * The new API is not yet available in our dependencies at this
                 * time, so we still use the deprecated one.
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), conf);
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("ABt job unsuccessful.");
    }

}

From source file:org.apache.mahout.math.hadoop.stochasticsvd.BtJob.java

License:Apache License

public static void run(Configuration conf, Path[] inputPathA, Path inputPathQJob, Path xiPath, Path outputPath,
        int minSplitSize, int k, int p, int btBlockHeight, int numReduceTasks, boolean broadcast,
        Class<? extends Writable> labelClass, boolean outputBBtProducts)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_Q, org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
            labelClass, VectorWritable.class);

    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_BBT,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
        /*
         * MAHOUT-1067: if we are asked to output BBT products then named vector
         * names should be propagated to Q too so that UJob could pick them up
         * from there.
         */
        oldApiJob.setBoolean(PROP_NV, true);
    }
    if (xiPath != null) {
        // compute PCA-related stuff as well
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SQ,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SB,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
    }

    /*
     * HACK: we use the old-API MultipleOutputs since they are not available in
     * the new API of either 0.20.2 or 0.20.203, but we wrap it into a new-API
     * job so we can use the new-API interfaces.
     */

    Job job = new Job(oldApiJob);
    job.setJobName("Bt-job");
    job.setJarByClass(BtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathA);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);

    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    job.getConfiguration().setInt(QJob.PROP_K, k);
    job.getConfiguration().setInt(QJob.PROP_P, p);
    job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
    job.getConfiguration().setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);

    job.setNumReduceTasks(numReduceTasks);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
    }

    /*
     * We can broadcast the Rhat files since all of them are required by each
     * job, but not the Q files, which correspond to splits of A (each split of
     * A requires only a particular Q file, a different one each time).
     */

    if (broadcast) {
        job.getConfiguration().set(PROP_RHAT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputPathQJob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(new Path(inputPathQJob, QJob.OUTPUT_RHAT + "-*"));
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * The new API is not yet available in our dependencies at this
                 * time, so we still use the deprecated one.
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Bt job unsuccessful.");
    }
}

From source file:org.apache.mahout.math.hadoop.stochasticsvd.SSVDHelper.java

License:Apache License

/**
 * sniff label type in the input files
 */
static Class<? extends Writable> sniffInputLabelType(Path[] inputPath, Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    for (Path p : inputPath) {
        FileStatus[] fstats = fs.globStatus(p);
        if (fstats == null || fstats.length == 0) {
            continue;
        }

        FileStatus firstSeqFile;
        if (fstats[0].isDir()) {
            firstSeqFile = fs.listStatus(fstats[0].getPath(), PathFilters.logsCRCFilter())[0];
        } else {
            firstSeqFile = fstats[0];
        }

        SequenceFile.Reader r = null;
        try {
            r = new SequenceFile.Reader(fs, firstSeqFile.getPath(), conf);
            return r.getKeyClass().asSubclass(Writable.class);
        } finally {
            Closeables.close(r, true);
        }
    }
    throw new IOException("Unable to open input files to determine input label type.");
}

From source file:org.apache.mahout.utils.ConcatenateVectorsJob.java

License:Apache License

private Class<? extends Writable> getKeyClass(Path path, FileSystem fs) throws IOException {
    // This works both for part* files and for a directory containing part* files.
    Path pathPattern = new Path(path, "part*");
    FileStatus[] paths = fs.globStatus(pathPattern);
    Preconditions.checkArgument(paths.length > 0, path.getName() + " is a file, should be a directory");

    Path file = paths[0].getPath();
    SequenceFile.Reader reader = null;
    try {
        reader = new SequenceFile.Reader(fs, file, fs.getConf());
        return reader.getKeyClass().asSubclass(Writable.class);
    } finally {
        Closeables.close(reader, true);
    }
}

From source file:org.apache.mahout.utils.eval.InMemoryFactorizationEvaluator.java

License:Apache License

private Matrix readMatrix(Path dir) throws IOException {

    Matrix matrix = new SparseMatrix(new int[] { Integer.MAX_VALUE, Integer.MAX_VALUE });

    FileSystem fs = dir.getFileSystem(getConf());
    for (FileStatus seqFile : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = seqFile.getPath();
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, path, getConf());
            IntWritable key = new IntWritable();
            VectorWritable value = new VectorWritable();
            while (reader.next(key, value)) {
                int row = key.get();
                Iterator<Vector.Element> elementsIterator = value.get().iterateNonZero();
                while (elementsIterator.hasNext()) {
                    Vector.Element element = elementsIterator.next();
                    matrix.set(row, element.index(), element.get());
                }
            }
        } finally {
            IOUtils.quietClose(reader);
        }
    }
    return matrix;
}

From source file:org.apache.mahout.utils.eval.InMemoryFactorizationEvaluator.java

License:Apache License

private List<Preference> readProbePreferences(Path dir) throws IOException {

    List<Preference> preferences = new LinkedList<Preference>();
    FileSystem fs = dir.getFileSystem(getConf());
    for (FileStatus seqFile : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = seqFile.getPath();
        InputStream in = null;
        try {
            in = fs.open(path);
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] tokens = TasteHadoopUtils.splitPrefTokens(line);
                long userID = Long.parseLong(tokens[0]);
                long itemID = Long.parseLong(tokens[1]);
                float value = Float.parseFloat(tokens[2]);
                preferences.add(new GenericPreference(userID, itemID, value));
            }
        } finally {
            IOUtils.quietClose(in);
        }
    }
    return preferences;
}

From source file:org.apache.mahout.utils.vectors.text.DictionaryVectorizer.java

License:Apache License

/**
 * Read the feature frequency list which is built at the end of the Word Count job and assign ids to the features.
 * This will use constant memory and will run at the speed of your disk read.
 *
 * @param minSupport
 * @param wordCountPath
 * @param dictionaryPathBase
 * @throws IOException
 */
private static List<Path> createDictionaryChunks(int minSupport, Path wordCountPath, Path dictionaryPathBase,
        int chunkSizeInMegabytes, Writable value, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = new ArrayList<Path>();

    Writable key = new Text();
    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(wordCountPath, OUTPUT_FILES_PATTERN));

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
            IntWritable.class);

    long currentChunkSize = 0;

    int i = 0;
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is the feature, value is its count
        while (reader.next(key, value)) {
            if (currentChunkSize > chunkSizeLimit) {
                dictWriter.close();
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
    }
    maxTermDimension[0] = i;
    dictWriter.close();

    return chunkPaths;
}