Example usage for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException

Source Link

Document

Returns all the files that match pathPattern and are not checksum files.

Usage

From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License:Apache License

/**
 * Reads the total weight sum from the sequence files matching {@code pathPattern}.
 *
 * <p>Every matching file is scanned; the value of the record whose first key element equals
 * {@link BayesConstants#TOTAL_SUM} is returned. If several such records exist, the last one read wins.
 *
 * @param fs          file system used to expand the glob and open the files
 * @param pathPattern glob pattern selecting the sequence files to read
 * @param conf        configuration handed to each {@code SequenceFile.Reader}
 * @return the value stored under {@code BayesConstants.TOTAL_SUM}
 * @throws IOException if a file cannot be read, if {@code pathPattern} resolves to nothing,
 *                     or if no {@code TOTAL_SUM} record is present
 */
public static double readSigmaJSigmaK(FileSystem fs, Path pathPattern, Configuration conf) throws IOException {
    Map<String, Double> weightSum = new HashMap<String, Double>();
    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    if (outputFiles == null) {
        // globStatus yields null for a non-glob path that does not exist.
        throw new IOException("No files found for " + pathPattern);
    }
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            while (reader.next(key, value)) {
                // Only TOTAL_SUM is ever inserted, so this guard is purely defensive.
                if (weightSum.size() > 1) {
                    throw new IOException("Incorrect Sum File");
                } else if (key.stringAt(0).equals(BayesConstants.TOTAL_SUM)) {
                    weightSum.put(BayesConstants.TOTAL_SUM, value.get());
                }
            }
        } finally {
            // Release the underlying stream even when reading fails part-way through.
            reader.close();
        }
    }

    Double total = weightSum.get(BayesConstants.TOTAL_SUM);
    if (total == null) {
        // Fail with a descriptive error instead of a NullPointerException on unboxing.
        throw new IOException("No " + BayesConstants.TOTAL_SUM + " record found in " + pathPattern);
    }
    return total;
}

From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License:Apache License

/**
 * Reads the vocabulary size from the sequence files matching {@code pathPattern}.
 *
 * <p>Every matching file is scanned; the value of the record whose first key element equals
 * {@link BayesConstants#FEATURE_SET_SIZE} is returned. If several such records exist, the last one
 * read wins.
 *
 * @param fs          file system used to expand the glob and open the files
 * @param pathPattern glob pattern selecting the sequence files to read
 * @param conf        configuration handed to each {@code SequenceFile.Reader}
 * @return the value stored under {@code BayesConstants.FEATURE_SET_SIZE}
 * @throws IOException if a file cannot be read, if {@code pathPattern} resolves to nothing,
 *                     or if no {@code FEATURE_SET_SIZE} record is present
 */
public static double readVocabCount(FileSystem fs, Path pathPattern, Configuration conf) throws IOException {
    Map<String, Double> weightSum = new HashMap<String, Double>();
    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    if (outputFiles == null) {
        // globStatus yields null for a non-glob path that does not exist.
        throw new IOException("No files found for " + pathPattern);
    }
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            while (reader.next(key, value)) {
                // Only FEATURE_SET_SIZE is ever inserted, so this guard is purely defensive.
                if (weightSum.size() > 1) {
                    throw new IOException("Incorrect vocabCount File");
                }
                if (key.stringAt(0).equals(BayesConstants.FEATURE_SET_SIZE)) {
                    weightSum.put(BayesConstants.FEATURE_SET_SIZE, value.get());
                }
            }
        } finally {
            // Release the underlying stream even when reading fails part-way through.
            reader.close();
        }
    }

    Double vocabCount = weightSum.get(BayesConstants.FEATURE_SET_SIZE);
    if (vocabCount == null) {
        // Fail with a descriptive error instead of a NullPointerException on unboxing.
        throw new IOException("No " + BayesConstants.FEATURE_SET_SIZE + " record found in " + pathPattern);
    }
    return vocabCount;
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver.java

License:Apache License

/**
 * Builds a {@link ConfusionMatrix} from the classifier output files matching {@code pathPattern}.
 *
 * <p>Each record's key supplies the correct label (element 1) and the classified label (element 2);
 * the value is truncated to an {@code int} count. A later record for the same label pair replaces
 * the earlier count.
 *
 * @param fs          file system used to expand the glob and open the files
 * @param pathPattern glob pattern selecting the sequence files to read
 * @param conf        configuration handed to each {@code SequenceFile.Reader}
 * @param params      job parameters; {@code "defaultCat"} is passed to the matrix as default label
 * @return the populated confusion matrix
 * @throws IOException if a file cannot be read or {@code pathPattern} resolves to nothing
 */
public static ConfusionMatrix readResult(FileSystem fs, Path pathPattern, Configuration conf, Parameters params)
        throws IOException {

    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();
    String defaultLabel = params.get("defaultCat");
    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    if (outputFiles == null) {
        // globStatus yields null for a non-glob path that does not exist.
        throw new IOException("No files found for " + pathPattern);
    }
    Map<String, Map<String, Integer>> confusionMatrix = new HashMap<String, Map<String, Integer>>();

    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            while (reader.next(key, value)) {
                String correctLabel = key.stringAt(1);
                String classifiedLabel = key.stringAt(2);
                Map<String, Integer> rowMatrix = confusionMatrix.get(correctLabel);
                if (rowMatrix == null) {
                    rowMatrix = new HashMap<String, Integer>();
                    confusionMatrix.put(correctLabel, rowMatrix);
                }
                int count = (int) value.get();
                rowMatrix.put(classifiedLabel, count);
            }
        } finally {
            // Release the underlying stream even when reading fails part-way through.
            reader.close();
        }
    }

    ConfusionMatrix matrix = new ConfusionMatrix(confusionMatrix.keySet(), defaultLabel);
    for (Map.Entry<String, Map<String, Integer>> correctLabelSet : confusionMatrix.entrySet()) {
        Map<String, Integer> rowMatrix = correctLabelSet.getValue();
        for (Map.Entry<String, Integer> classifiedLabelSet : rowMatrix.entrySet()) {
            matrix.addInstance(correctLabelSet.getKey(), classifiedLabelSet.getKey());
            // putCount follows addInstance, so the stored count is the one read from the file.
            matrix.putCount(correctLabelSet.getKey(), classifiedLabelSet.getKey(),
                    classifiedLabelSet.getValue());
        }
    }
    return matrix;
}

From source file:org.apache.mahout.clustering.evaluation.RepresentativePointsDriver.java

License:Apache License

/**
 * Copies the id and center of every cluster found under {@code clustersIn} into sequence files
 * below {@code output}.
 *
 * <p>For each directory matching {@code clustersIn}, every part file accepted by
 * {@code PathFilters.logsCRCFilter()} is rewritten to a file of the same name under {@code output},
 * keyed by cluster id ({@code IntWritable}) with the cluster center as value ({@code VectorWritable}).
 *
 * @param output     directory that receives the rewritten part files
 * @param clustersIn path or glob selecting the cluster directories to read
 * @throws IOException if reading the clusters or writing the output fails
 */
private static void writeInitialState(Path output, Path clustersIn) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    FileStatus[] clusterDirs = fs.globStatus(clustersIn);
    for (FileStatus clusterDir : clusterDirs) {
        FileStatus[] parts = fs.listStatus(clusterDir.getPath(), PathFilters.logsCRCFilter());
        for (FileStatus partStatus : parts) {
            Path source = partStatus.getPath();
            Path target = new Path(output, source.getName());
            SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, target, IntWritable.class,
                    VectorWritable.class);
            try {
                Iterable<ClusterWritable> clusters = new SequenceFileValueIterable<ClusterWritable>(source,
                        true, conf);
                for (ClusterWritable entry : clusters) {
                    Cluster cluster = entry.getValue();
                    if (log.isDebugEnabled()) {
                        log.debug("C-{}: {}", cluster.getId(),
                                AbstractCluster.formatVector(cluster.getCenter(), null));
                    }
                    IntWritable clusterId = new IntWritable(cluster.getId());
                    VectorWritable center = new VectorWritable(cluster.getCenter());
                    writer.append(clusterId, center);
                }
            } finally {
                // Propagate close failures (swallowIOException = false).
                Closeables.close(writer, false);
            }
        }
    }
}

From source file:org.apache.mahout.clustering.lda.LDADriver.java

License:Apache License

/**
 * Rebuilds the {@link LDAState} from the {@code part-*} sequence files in the state directory
 * named by {@code STATE_IN_KEY}.
 *
 * <p>Records are dispatched on their (topic, word) key: a word equal to {@code TOPIC_SUM_KEY}
 * stores the topic's log total, a topic equal to {@code LOG_LIKELIHOOD_KEY} stores the log
 * likelihood, and anything else fills one cell of the topic/word matrix.
 *
 * @param job configuration supplying the state path, topic count, word count and topic smoothing
 * @return the reconstructed state
 * @throws IOException              if a state file cannot be read
 * @throws IllegalArgumentException if a value is infinite, a key is negative, or a matrix cell is
 *                                  written twice
 */
static LDAState createState(Configuration job) throws IOException {
    String statePath = job.get(STATE_IN_KEY);
    int numTopics = Integer.parseInt(job.get(NUM_TOPICS_KEY));
    int numWords = Integer.parseInt(job.get(NUM_WORDS_KEY));
    double topicSmoothing = Double.parseDouble(job.get(TOPIC_SMOOTHING_KEY));

    Path dir = new Path(statePath);
    FileSystem fs = dir.getFileSystem(job);

    DenseMatrix pWgT = new DenseMatrix(numTopics, numWords);
    double[] logTotals = new double[numTopics];
    double ll = 0.0;

    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = status.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
        try {
            while (reader.next(key, value)) {
                int topic = key.getFirst();
                int word = key.getSecond();
                if (word == TOPIC_SUM_KEY) {
                    logTotals[topic] = value.get();
                    if (Double.isInfinite(value.get())) {
                        throw new IllegalArgumentException("Infinite log total for topic " + topic);
                    }
                } else if (topic == LOG_LIKELIHOOD_KEY) {
                    ll = value.get();
                } else {
                    if (!((topic >= 0) && (word >= 0))) {
                        throw new IllegalArgumentException(topic + " " + word);
                    }
                    // A non-zero cell means this (topic, word) pair was already loaded.
                    if (pWgT.getQuick(topic, word) != 0.0) {
                        throw new IllegalArgumentException(
                                "Duplicate entry for topic " + topic + ", word " + word);
                    }
                    pWgT.setQuick(topic, word, value.get());
                    if (Double.isInfinite(pWgT.getQuick(topic, word))) {
                        throw new IllegalArgumentException(
                                "Infinite value for topic " + topic + ", word " + word);
                    }
                }
            }
        } finally {
            // Close the reader on validation failures and I/O errors too, not just on success.
            reader.close();
        }
    }

    return new LDAState(numTopics, numWords, topicSmoothing, pWgT, logTotals, ll);
}

From source file:org.apache.mahout.clustering.lda.LDADriver.java

License:Apache License

/**
 * Extracts the log likelihood stored in the {@code part-*} sequence files under {@code statePath}.
 *
 * <p>Within each file, scanning stops at the first record whose first key element equals
 * {@code LOG_LIKELIHOOD_KEY}; remaining files are still visited, so the value found in the last
 * file that contains one is returned.
 *
 * @param statePath directory holding the state part files
 * @param job       configuration used to resolve the file system and open the files
 * @return the log likelihood, or {@code 0.0} if no file contains one
 * @throws IOException if a state file cannot be read
 */
private double findLL(Path statePath, Configuration job) throws IOException {
    FileSystem fs = statePath.getFileSystem(job);

    double ll = 0.0;

    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(statePath, "part-*"))) {
        Path path = status.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
        try {
            while (reader.next(key, value)) {
                if (key.getFirst() == LOG_LIKELIHOOD_KEY) {
                    ll = value.get();
                    break;
                }
            }
        } finally {
            // Close the reader even when reading fails, so the stream is not leaked.
            reader.close();
        }
    }

    return ll;
}

From source file:org.apache.mahout.common.HadoopUtil.java

License:Apache License

/**
 * Resolves {@code path} to file statuses, either by glob expansion or by directory listing.
 *
 * @param path     path or glob pattern to resolve
 * @param pathType {@code PathType.GLOB} to expand {@code path} as a glob; any other value lists it
 * @param filter   optional filter applied to the results; may be {@code null}
 * @param ordering optional comparator used to sort the results in place; may be {@code null}
 * @param conf     configuration used to obtain the file system for {@code path}
 * @return the matching statuses, sorted when {@code ordering} is given
 * @throws IOException if the file system cannot be reached or queried
 */
public static FileStatus[] getFileStatus(Path path, PathType pathType, PathFilter filter,
        Comparator<FileStatus> ordering, Configuration conf) throws IOException {
    FileSystem fileSystem = path.getFileSystem(conf);
    boolean hasFilter = filter != null;

    FileStatus[] matches;
    if (pathType == PathType.GLOB) {
        matches = hasFilter ? fileSystem.globStatus(path, filter) : fileSystem.globStatus(path);
    } else {
        matches = hasFilter ? listStatus(fileSystem, path, filter) : listStatus(fileSystem, path);
    }

    // NOTE(review): globStatus may return null for a missing non-glob path; sorting would then
    // throw a NullPointerException. Confirm whether callers rely on that.
    if (ordering != null) {
        Arrays.sort(matches, ordering);
    }
    return matches;
}

From source file:org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator.java

License:Apache License

/**
 * Creates an iterator over the values of all sequence files selected by {@code path}.
 *
 * <p>The files are found with {@link FileSystem#globStatus(Path)} when {@code pathType} is
 * {@code PathType.GLOB}, and with {@link FileSystem#listStatus(Path)} otherwise. When
 * {@code filter} is non-null, the corresponding filtered overload is used instead.
 *
 * @param path                   path or glob pattern selecting the files
 * @param pathType               how {@code path} is to be interpreted
 * @param filter                 optional filter for the selected files; may be {@code null}
 * @param ordering               ordering forwarded to {@code init}
 * @param reuseKeyValueInstances reuse flag forwarded to {@code init}
 * @param conf                   configuration used to obtain the file system
 * @throws IOException if the file system cannot be reached or queried
 */
public SequenceFileDirValueIterator(Path path, PathType pathType, PathFilter filter,
        Comparator<FileStatus> ordering, boolean reuseKeyValueInstances, Configuration conf)
        throws IOException {
    FileSystem fileSystem = FileSystem.get(path.toUri(), conf);
    boolean hasFilter = filter != null;

    FileStatus[] selected;
    if (pathType == PathType.GLOB) {
        selected = hasFilter ? fileSystem.globStatus(path, filter) : fileSystem.globStatus(path);
    } else {
        selected = hasFilter ? fileSystem.listStatus(path, filter) : fileSystem.listStatus(path);
    }

    iterators = Lists.newArrayList();
    init(selected, ordering, reuseKeyValueInstances, conf);
}

From source file:org.apache.mahout.fpm.bigfim.BigFIMDriver.java

License:Apache License

/**
 * Configures and runs the "Start Mining" job over the {@code bucket*} files found in the
 * {@code pg} sub-directory of {@code outputDir}.
 *
 * <p>Depending on {@code config.getWriteSets()}, the job either writes the sets themselves
 * ({@code Text} values) or only their counts ({@code LongWritable} values). Output is written as
 * text by a single reducer, and the elapsed time is printed to standard output.
 *
 * @param outputDir base directory containing the {@code pg} input directory and receiving the output
 * @param config    mining configuration copied into the Hadoop configuration
 * @throws IOException            if the file system or job submission fails
 * @throws ClassNotFoundException if a job class cannot be loaded
 * @throws InterruptedException   if waiting for the job is interrupted
 */
private static void startMining(String outputDir, Config config)
        throws IOException, ClassNotFoundException, InterruptedException {
    String inputFilesDir = outputDir + separator + "pg" + separator;
    String outputFile = outputDir + separator + OFis;
    System.out.println("[StartMining]: input: " + inputFilesDir + ", output: " + outputFile);

    Configuration conf = new Configuration();
    setConfigurationValues(conf, config);

    Job job = new Job(conf, "Start Mining");
    job.setJarByClass(BigFIMDriver.class);

    job.setOutputKeyClass(Text.class);

    if (config.getWriteSets()) {
        job.setOutputValueClass(Text.class);
        job.setMapperClass(EclatMinerMapper.class);
        job.setReducerClass(EclatMinerReducer.class);
    } else {
        job.setOutputValueClass(LongWritable.class);
        job.setMapperClass(EclatMinerMapperSetCount.class);
        job.setReducerClass(EclatMinerReducerSetCount.class);
    }

    job.setInputFormatClass(NoSplitSequenceFileInputFormat.class);
    // Text output is the effective format; the earlier SequenceFileOutputFormat setting was
    // immediately overridden and has been removed.
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setNumReduceTasks(1);

    List<Path> inputPaths = new ArrayList<Path>();

    // FileSystem.get(conf) hands out a cached instance shared across the JVM, so it is
    // deliberately NOT closed here: closing it would break every other holder of that instance.
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] listStatus = fs.globStatus(new Path(inputFilesDir + "bucket*"));
    for (FileStatus fstat : listStatus) {
        inputPaths.add(fstat.getPath());
    }

    FileInputFormat.setInputPaths(job, inputPaths.toArray(new Path[inputPaths.size()]));
    FileOutputFormat.setOutputPath(job, new Path(outputFile));

    long start = System.currentTimeMillis();
    job.waitForCompletion(true);
    long end = System.currentTimeMillis();
    System.out.println("Job Mining took " + (end - start) / 1000 + "s");
}

From source file:org.apache.mahout.fpm.disteclat.DistEclatDriver.java

License:Apache License

/**
 * Starts the third MapReduce cycle. Each mapper reads the prefix groups assigned to it and computes
 * the collection of closed sets. All information is reported to the reducer, which finally writes
 * the output to disk.
 *
 * <p>Input files are those matching {@code inputDir + "bucket*"}. Output is written as text by a
 * single reducer, and the elapsed time is printed to standard output.
 *
 * @param inputDir  prefix of the bucket files to mine; {@code "bucket*"} is appended verbatim
 * @param outputDir base directory under which the output file is created
 * @param config    mining configuration copied into the Hadoop configuration
 * @throws IOException            if the file system or job submission fails
 * @throws InterruptedException   if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be loaded
 * @throws URISyntaxException     declared by the signature; not raised directly in this method
 */
private static void startMining(String inputDir, String outputDir, Config config)
        throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {

    String outputFile = outputDir + separator + OFis;
    System.out.println("[StartMining]: input: " + inputDir + ", output: " + outputFile);

    Configuration conf = new Configuration();
    setConfigurationValues(conf, config);

    Job miningJob = new Job(conf, "Start Mining");
    miningJob.setJarByClass(DistEclatDriver.class);
    miningJob.setInputFormatClass(NoSplitSequenceFileInputFormat.class);
    miningJob.setOutputFormatClass(TextOutputFormat.class);
    miningJob.setOutputKeyClass(Text.class);
    miningJob.setNumReduceTasks(1);

    // Either emit the sets themselves or only their counts.
    boolean writeSets = config.getWriteSets();
    if (!writeSets) {
        miningJob.setOutputValueClass(LongWritable.class);
        miningJob.setMapperClass(EclatMinerMapperSetCount.class);
        miningJob.setReducerClass(EclatMinerReducerSetCount.class);
    } else {
        miningJob.setOutputValueClass(Text.class);
        miningJob.setMapperClass(EclatMinerMapper.class);
        miningJob.setReducerClass(EclatMinerReducer.class);
    }

    FileSystem fs = FileSystem.get(conf);
    FileStatus[] buckets = fs.globStatus(new Path(inputDir + "bucket*"));
    Path[] bucketPaths = new Path[buckets.length];
    for (int i = 0; i < buckets.length; i++) {
        bucketPaths[i] = buckets[i].getPath();
    }

    FileInputFormat.setInputPaths(miningJob, bucketPaths);
    FileOutputFormat.setOutputPath(miningJob, new Path(outputFile));

    long startMillis = System.currentTimeMillis();
    miningJob.waitForCompletion(true);
    long elapsedSeconds = (System.currentTimeMillis() - startMillis) / 1000;
    System.out.println("[Mining]: Took " + elapsedSeconds + "s");
    fs.close();
}