Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
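
Before the full examples, here is a minimal, self-contained sketch of calling globStatus. The directory /data/output and the part-* pattern are illustrative assumptions, not values taken from the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical glob pattern; adjust to your own directory layout.
        Path pattern = new Path("/data/output/part-*");
        FileSystem fs = pattern.getFileSystem(conf);

        // globStatus can return null when nothing matches, so guard before
        // iterating, as the examples below also do.
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}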

Usage

From source file: org.apache.lens.server.util.ScannedPaths.java

License: Apache License

/**
 * Method that computes path of resources matching the input path or path regex pattern.
 * If provided path is a directory it additionally checks for the jar_order or glob_order file
 * that imposes ordering of resources and filters out other resources.
 *
 * Updates finalPaths List with matched paths and returns the list of matched paths.
 */
private List<String> getMatchedPaths(Path pt, String type) {
    List<String> finalPaths = new ArrayList<>();
    InputStream resourceOrderIStream = null;
    FileSystem fs;

    try {
        fs = pt.getFileSystem(new Configuration());
        if (fs.exists(pt)) {
            if (fs.isFile(pt)) {
                /**
                 * CASE 1 : Direct FILE provided in path
                 **/
                finalPaths.add(pt.toUri().toString());
            } else if (fs.isDirectory(pt)) {
                /**
                 * CASE 2 : DIR provided in path
                 **/
                Path resourceOrderFile;
                FileStatus[] statuses;
                List<String> newMatches;
                List<String> resources;

                resourceOrderFile = new Path(pt, "jar_order");
                /** Add everything in dir if no jar_order or glob_order is present **/
                if (!fs.exists(resourceOrderFile)) {
                    resourceOrderFile = new Path(pt, "glob_order");
                    if (!fs.exists(resourceOrderFile)) {
                        resourceOrderFile = null;
                        /** Get matched resources recursively for all files **/
                        statuses = fs.globStatus(new Path(pt, "*"));
                        if (statuses != null) {
                            for (FileStatus st : statuses) {
                                newMatches = getMatchedPaths(st.getPath(), type);
                                finalPaths.addAll(newMatches);
                            }
                        }
                    }
                }
                if (resourceOrderFile != null) {
                    /** Else get jars as per order specified in jar_order/glob_order **/
                    resourceOrderIStream = fs.open(resourceOrderFile);
                    resources = IOUtils.readLines(resourceOrderIStream, Charset.forName("UTF-8"));
                    for (String resource : resources) {
                        if (StringUtils.isBlank(resource)) {
                            continue;
                        }
                        resource = resource.trim();

                        /** Get matched resources recursively for provided path/pattern **/
                        if (resource.startsWith("/") || resource.contains(":/")) {
                            newMatches = getMatchedPaths(new Path(resource), type);
                        } else {
                            newMatches = getMatchedPaths(new Path(pt, resource), type);
                        }
                        finalPaths.addAll(newMatches);
                    }
                }
            }
        } else {
            /**
             * CASE 3 : REGEX provided in path
             * */
            FileStatus[] statuses = fs.globStatus(Path.getPathWithoutSchemeAndAuthority(pt));
            if (statuses != null) {
                for (FileStatus st : statuses) {
                    List<String> newMatches = getMatchedPaths(st.getPath(), type);
                    finalPaths.addAll(newMatches);
                }
            }
        }
        filterDirsAndJarType(fs, finalPaths);
    } catch (FileNotFoundException fex) {
        log.error("File not found while scanning path. Path: {}, Type: {}", path, type, fex);
    } catch (Exception e) {
        log.error("Exception while initializing PathScanner. Path: {}, Type: {}", path, type, e);
    } finally {
        IOUtils.closeQuietly(resourceOrderIStream);
    }

    return finalPaths;
}

From source file: org.apache.mahout.cf.taste.hadoop.als.eval.InMemoryFactorizationEvaluator.java

License: Apache License

private Matrix readMatrix(Path dir) throws IOException {

    Matrix matrix = new SparseMatrix(new int[] { Integer.MAX_VALUE, Integer.MAX_VALUE });

    FileSystem fs = dir.getFileSystem(getConf());
    for (FileStatus seqFile : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = seqFile.getPath();
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, path, getConf());
            IntWritable key = new IntWritable();
            VectorWritable value = new VectorWritable();
            while (reader.next(key, value)) {
                int row = key.get();
                Iterator<Vector.Element> elementsIterator = value.get().iterateNonZero();
                while (elementsIterator.hasNext()) {
                    Vector.Element element = elementsIterator.next();
                    matrix.set(row, element.index(), element.get());
                }
            }
        } finally {
            Closeables.closeQuietly(reader);
        }
    }
    return matrix;
}

From source file: org.apache.mahout.cf.taste.hadoop.als.eval.InMemoryFactorizationEvaluator.java

License: Apache License

private List<Preference> readProbePreferences(Path dir) throws IOException {

    List<Preference> preferences = new LinkedList<Preference>();
    FileSystem fs = dir.getFileSystem(getConf());
    for (FileStatus seqFile : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = seqFile.getPath();
        InputStream in = null;
        try {
            in = fs.open(path);
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] tokens = TasteHadoopUtils.splitPrefTokens(line);
                long userID = Long.parseLong(tokens[0]);
                long itemID = Long.parseLong(tokens[1]);
                float value = Float.parseFloat(tokens[2]);
                preferences.add(new GenericPreference(userID, itemID, value));
            }
        } finally {
            Closeables.closeQuietly(in);
        }
    }
    return preferences;
}

From source file: org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License: Apache License

public static void loadWeightMatrix(InMemoryBayesDatastore datastore, FileSystem fs, Path pathPattern,
        Configuration conf) throws IOException {

    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        log.info("{}", path);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);

        // the key is label,feature
        while (reader.next(key, value)) {

            datastore.loadFeatureWeight(key.stringAt(2), key.stringAt(1), value.get());

        }
    }
}

From source file: org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License: Apache License

public static void loadFeatureWeights(InMemoryBayesDatastore datastore, FileSystem fs, Path pathPattern,
        Configuration conf) throws IOException {

    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        log.info("{}", path);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);

        // the key is either _label_ or label,feature
        long count = 0;
        while (reader.next(key, value)) {
            // Sum of weights for a Feature
            if (key.stringAt(0).equals(BayesConstants.FEATURE_SUM)) {
                datastore.setSumFeatureWeight(key.stringAt(1), value.get());
                count++;
                if (count % 50000 == 0) {
                    log.info("Read {} feature weights", count);
                }
            }
        }
    }
}

From source file: org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License: Apache License

public static void loadLabelWeights(InMemoryBayesDatastore datastore, FileSystem fs, Path pathPattern,
        Configuration conf) throws IOException {

    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        log.info("{}", path);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);

        long count = 0;
        while (reader.next(key, value)) {
            // Sum of weights in a Label
            if (key.stringAt(0).equals(BayesConstants.LABEL_SUM)) {
                datastore.setSumLabelWeight(key.stringAt(1), value.get());
                count++;
                if (count % 10000 == 0) {
                    log.info("Read {} label weights", count);
                }
            }
        }
    }
}

From source file: org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License: Apache License

public static void loadThetaNormalizer(InMemoryBayesDatastore datastore, FileSystem fs, Path pathPattern,
        Configuration conf) throws IOException {

    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        log.info("{}", path);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);

        long count = 0;
        while (reader.next(key, value)) {
            // Sum of weights in a Label
            if (key.stringAt(0).equals(BayesConstants.LABEL_THETA_NORMALIZER)) {
                datastore.setThetaNormalizer(key.stringAt(1), value.get());
                count++;
                if (count % 50000 == 0) {
                    log.info("Read {} theta norms", count);
                }
            }
        }
    }
}

From source file: org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License: Apache License

public static void loadSumWeight(InMemoryBayesDatastore datastore, FileSystem fs, Path pathPattern,
        Configuration conf) throws IOException {

    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        log.info("{}", path);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);

        // the key is _label
        while (reader.next(key, value)) {

            // Sum of weights for all Features and all Labels
            if (key.stringAt(0).equals(BayesConstants.TOTAL_SUM)) {
                datastore.setSigmaJSigmaK(value.get());
                log.info("{}", value.get());
            }
        }
    }
}

From source file: org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License: Apache License

public static Map<String, Double> readLabelSums(FileSystem fs, Path pathPattern, Configuration conf)
        throws IOException {
    Map<String, Double> labelSum = new HashMap<String, Double>();
    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);

    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // the key is either _label_ or label,feature
        while (reader.next(key, value)) {
            // Sum of counts of labels
            if (key.stringAt(0).equals(BayesConstants.LABEL_SUM)) {
                labelSum.put(key.stringAt(1), value.get());
            }

        }
    }

    return labelSum;
}

From source file: org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License: Apache License

public static Map<String, Double> readLabelDocumentCounts(FileSystem fs, Path pathPattern, Configuration conf)
        throws IOException {
    Map<String, Double> labelDocumentCounts = new HashMap<String, Double>();
    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // the key is either _label_ or label,feature
        while (reader.next(key, value)) {
            // Count of Documents in a Label
            if (key.stringAt(0).equals(BayesConstants.LABEL_COUNT)) {
                labelDocumentCounts.put(key.stringAt(1), value.get());
            }

        }
    }

    return labelDocumentCounts;
}