Example usage for org.apache.hadoop.fs FileSystem globStatus

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem.globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
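
A minimal, self-contained sketch of calling globStatus directly (the /data/output/part-* pattern is a hypothetical placeholder; several examples below also use the two-argument overload globStatus(Path pathPattern, PathFilter filter)):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path pattern = new Path("/data/output/part-*"); // hypothetical glob
        FileSystem fs = pattern.getFileSystem(conf);
        // Expands the glob to the matching, non-checksum files; the result
        // may be null when the pattern's parent directory does not exist.
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}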

Usage

From source file: org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License: Apache License

public static double readSigmaJSigmaK(FileSystem fs, Path pathPattern, Configuration conf) throws IOException {
    Map<String, Double> weightSum = new HashMap<String, Double>();
    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            while (reader.next(key, value)) {
                if (weightSum.size() > 1) {
                    throw new IOException("Incorrect Sum File");
                } else if (key.stringAt(0).equals(BayesConstants.TOTAL_SUM)) {
                    weightSum.put(BayesConstants.TOTAL_SUM, value.get());
                }
            }
        } finally {
            reader.close(); // always release the reader, even on malformed input
        }
    }

    return weightSum.get(BayesConstants.TOTAL_SUM);
}

From source file: org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License: Apache License

public static double readVocabCount(FileSystem fs, Path pathPattern, Configuration conf) throws IOException {
    Map<String, Double> weightSum = new HashMap<String, Double>();
    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            while (reader.next(key, value)) {
                if (weightSum.size() > 1) {
                    throw new IOException("Incorrect vocabCount File");
                }
                if (key.stringAt(0).equals(BayesConstants.FEATURE_SET_SIZE)) {
                    weightSum.put(BayesConstants.FEATURE_SET_SIZE, value.get());
                }
            }
        } finally {
            reader.close(); // always release the reader, even on malformed input
        }
    }

    return weightSum.get(BayesConstants.FEATURE_SET_SIZE);
}

From source file: org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver.java

License: Apache License

public static ConfusionMatrix readResult(FileSystem fs, Path pathPattern, Configuration conf, Parameters params)
        throws IOException {

    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();
    String defaultLabel = params.get("defaultCat");
    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    Map<String, Map<String, Integer>> confusionMatrix = new HashMap<String, Map<String, Integer>>();

    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            while (reader.next(key, value)) {
                String correctLabel = key.stringAt(1);
                String classifiedLabel = key.stringAt(2);
                Map<String, Integer> rowMatrix = confusionMatrix.get(correctLabel);
                if (rowMatrix == null) {
                    rowMatrix = new HashMap<String, Integer>();
                }
                Integer count = Double.valueOf(value.get()).intValue();
                rowMatrix.put(classifiedLabel, count);
                confusionMatrix.put(correctLabel, rowMatrix);
            }
        } finally {
            reader.close(); // always release the reader, even on exception
        }
    }

    ConfusionMatrix matrix = new ConfusionMatrix(confusionMatrix.keySet(), defaultLabel);
    for (Map.Entry<String, Map<String, Integer>> correctLabelSet : confusionMatrix.entrySet()) {
        Map<String, Integer> rowMatrix = correctLabelSet.getValue();
        for (Map.Entry<String, Integer> classifiedLabelSet : rowMatrix.entrySet()) {
            matrix.addInstance(correctLabelSet.getKey(), classifiedLabelSet.getKey());
            matrix.putCount(correctLabelSet.getKey(), classifiedLabelSet.getKey(),
                    classifiedLabelSet.getValue());
        }
    }
    return matrix;

}

From source file: org.apache.mahout.clustering.evaluation.RepresentativePointsDriver.java

License: Apache License

private static void writeInitialState(Path output, Path clustersIn) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    for (FileStatus dir : fs.globStatus(clustersIn)) {
        Path inPath = dir.getPath();
        for (FileStatus part : fs.listStatus(inPath, PathFilters.logsCRCFilter())) {
            Path inPart = part.getPath();
            Path path = new Path(output, inPart.getName());
            SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, IntWritable.class,
                    VectorWritable.class);
            try {
                for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>(inPart,
                        true, conf)) {
                    Cluster cluster = clusterWritable.getValue();
                    if (log.isDebugEnabled()) {
                        log.debug("C-{}: {}", cluster.getId(),
                                AbstractCluster.formatVector(cluster.getCenter(), null));
                    }
                    writer.append(new IntWritable(cluster.getId()), new VectorWritable(cluster.getCenter()));
                }
            } finally {
                Closeables.close(writer, false);
            }
        }
    }
}

From source file: org.apache.mahout.clustering.lda.LDADriver.java

License: Apache License

static LDAState createState(Configuration job) throws IOException {
    String statePath = job.get(STATE_IN_KEY);
    int numTopics = Integer.parseInt(job.get(NUM_TOPICS_KEY));
    int numWords = Integer.parseInt(job.get(NUM_WORDS_KEY));
    double topicSmoothing = Double.parseDouble(job.get(TOPIC_SMOOTHING_KEY));

    Path dir = new Path(statePath);
    FileSystem fs = dir.getFileSystem(job);

    DenseMatrix pWgT = new DenseMatrix(numTopics, numWords);
    double[] logTotals = new double[numTopics];
    double ll = 0.0;

    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = status.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
        while (reader.next(key, value)) {
            int topic = key.getFirst();
            int word = key.getSecond();
            if (word == TOPIC_SUM_KEY) {
                logTotals[topic] = value.get();
                if (Double.isInfinite(value.get())) {
                    throw new IllegalArgumentException();
                }
            } else if (topic == LOG_LIKELIHOOD_KEY) {
                ll = value.get();
            } else {
                if (!((topic >= 0) && (word >= 0))) {
                    throw new IllegalArgumentException(topic + " " + word);
                }
                if (pWgT.getQuick(topic, word) != 0.0) {
                    throw new IllegalArgumentException();
                }
                pWgT.setQuick(topic, word, value.get());
                if (Double.isInfinite(pWgT.getQuick(topic, word))) {
                    throw new IllegalArgumentException();
                }
            }
        }
        reader.close();
    }

    return new LDAState(numTopics, numWords, topicSmoothing, pWgT, logTotals, ll);
}

From source file: org.apache.mahout.clustering.lda.LDADriver.java

License: Apache License

private double findLL(Path statePath, Configuration job) throws IOException {
    FileSystem fs = statePath.getFileSystem(job);

    double ll = 0.0;

    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(statePath, "part-*"))) {
        Path path = status.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
        while (reader.next(key, value)) {
            if (key.getFirst() == LOG_LIKELIHOOD_KEY) {
                ll = value.get();
                break;
            }
        }
        reader.close();
    }

    return ll;
}

From source file: org.apache.mahout.common.HadoopUtil.java

License: Apache License

public static FileStatus[] getFileStatus(Path path, PathType pathType, PathFilter filter,
        Comparator<FileStatus> ordering, Configuration conf) throws IOException {
    FileStatus[] statuses;
    FileSystem fs = path.getFileSystem(conf);
    if (filter == null) {
        statuses = pathType == PathType.GLOB ? fs.globStatus(path) : listStatus(fs, path);
    } else {
        statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : listStatus(fs, path, filter);
    }
    if (ordering != null) {
        Arrays.sort(statuses, ordering);
    }
    return statuses;
}
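
A hypothetical call to this helper, expanding a glob and ordering the matches by file name (the pattern and comparator are assumptions, not part of the original source):

Configuration conf = new Configuration();
Path pattern = new Path("/data/output/part-*"); // hypothetical glob
FileStatus[] parts = HadoopUtil.getFileStatus(pattern, PathType.GLOB, null,
        new Comparator<FileStatus>() {
            @Override
            public int compare(FileStatus a, FileStatus b) {
                // sort matches lexicographically by file name
                return a.getPath().getName().compareTo(b.getPath().getName());
            }
        }, conf);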

From source file: org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator.java

License: Apache License

/**
 * Constructor that uses either {@link FileSystem#listStatus(Path)} or
 * {@link FileSystem#globStatus(Path)} to obtain the list of files to iterate over
 * (depending on the pathType parameter).
 */
public SequenceFileDirValueIterator(Path path, PathType pathType, PathFilter filter,
        Comparator<FileStatus> ordering, boolean reuseKeyValueInstances, Configuration conf)
        throws IOException {
    FileStatus[] statuses;
    FileSystem fs = FileSystem.get(path.toUri(), conf);
    if (filter == null) {
        statuses = pathType == PathType.GLOB ? fs.globStatus(path) : fs.listStatus(path);
    } else {
        statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : fs.listStatus(path, filter);
    }
    iterators = Lists.newArrayList();
    init(statuses, ordering, reuseKeyValueInstances, conf);
}
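
A hypothetical invocation of this constructor, iterating over every value in the part files matched by a glob (the pattern and the VectorWritable value type are assumptions; with PathType.GLOB the file list is resolved through globStatus):

Path pattern = new Path("/data/vectors/part-*"); // hypothetical glob
SequenceFileDirValueIterator<VectorWritable> values = new SequenceFileDirValueIterator<VectorWritable>(
        pattern, PathType.GLOB, null, null, true, new Configuration());
while (values.hasNext()) {
    VectorWritable vector = values.next();
    // ... process each vector ...
}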

From source file: org.apache.mahout.fpm.bigfim.BigFIMDriver.java

License: Apache License

private static void startMining(String outputDir, Config config)
        throws IOException, ClassNotFoundException, InterruptedException {
    String inputFilesDir = outputDir + separator + "pg" + separator;
    String outputFile = outputDir + separator + OFis;
    System.out.println("[StartMining]: input: " + inputFilesDir + ", output: " + outputFile);

    Configuration conf = new Configuration();
    setConfigurationValues(conf, config);

    Job job = new Job(conf, "Start Mining");
    job.setJarByClass(BigFIMDriver.class);

    job.setOutputKeyClass(Text.class);

    if (config.getWriteSets()) {
        job.setOutputValueClass(Text.class);
        job.setMapperClass(EclatMinerMapper.class);
        job.setReducerClass(EclatMinerReducer.class);
    } else {
        job.setOutputValueClass(LongWritable.class);
        job.setMapperClass(EclatMinerMapperSetCount.class);
        job.setReducerClass(EclatMinerReducerSetCount.class);
    }

    job.setInputFormatClass(NoSplitSequenceFileInputFormat.class);
    // Only the last call takes effect, so the job emits plain text output.
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setNumReduceTasks(1);

    List<Path> inputPaths = new ArrayList<Path>();

    FileSystem fs = FileSystem.get(conf);
    FileStatus[] listStatus = fs.globStatus(new Path(inputFilesDir + "bucket*"));
    fs.close();
    for (FileStatus fstat : listStatus) {
        inputPaths.add(fstat.getPath());
    }

    FileInputFormat.setInputPaths(job, inputPaths.toArray(new Path[inputPaths.size()]));
    FileOutputFormat.setOutputPath(job, new Path(outputFile));

    long start = System.currentTimeMillis();
    job.waitForCompletion(true);
    long end = System.currentTimeMillis();
    System.out.println("Job Mining took " + (end - start) / 1000 + "s");
}

From source file: org.apache.mahout.fpm.disteclat.DistEclatDriver.java

License: Apache License

/**
 * Starts the third MapReduce cycle. Each mapper reads the prefix groups assigned to it and computes the collection of
 * closed sets. All information is reported to the reducer which finally writes the output to disk.
 *
 * @param inputDir
 * @param outputDir
 * @param config
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 * @throws URISyntaxException
 */
private static void startMining(String inputDir, String outputDir, Config config)
        throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {

    String inputFilesDir = inputDir;
    String outputFile = outputDir + separator + OFis;
    System.out.println("[StartMining]: input: " + inputFilesDir + ", output: " + outputFile);

    Configuration conf = new Configuration();
    setConfigurationValues(conf, config);

    Job job = new Job(conf, "Start Mining");
    job.setJarByClass(DistEclatDriver.class);

    job.setOutputKeyClass(Text.class);

    if (config.getWriteSets()) {
        job.setOutputValueClass(Text.class);
        job.setMapperClass(EclatMinerMapper.class);
        job.setReducerClass(EclatMinerReducer.class);
    } else {
        job.setOutputValueClass(LongWritable.class);
        job.setMapperClass(EclatMinerMapperSetCount.class);
        job.setReducerClass(EclatMinerReducerSetCount.class);
    }

    job.setInputFormatClass(NoSplitSequenceFileInputFormat.class);

    List<Path> inputPaths = new ArrayList<Path>();

    FileSystem fs = FileSystem.get(conf);
    FileStatus[] listStatus = fs.globStatus(new Path(inputFilesDir + "bucket*"));
    for (FileStatus fstat : listStatus) {
        inputPaths.add(fstat.getPath());
    }

    FileInputFormat.setInputPaths(job, inputPaths.toArray(new Path[inputPaths.size()]));
    FileOutputFormat.setOutputPath(job, new Path(outputFile));

    job.setOutputFormatClass(TextOutputFormat.class);

    job.setNumReduceTasks(1);

    long start = System.currentTimeMillis();
    job.waitForCompletion(true);
    long end = System.currentTimeMillis();
    System.out.println("[Mining]: Took " + (end - start) / 1000 + "s");
    fs.close();
}