List of usage examples for org.apache.hadoop.fs FileUtil stat2Paths
public static Path[] stat2Paths(FileStatus[] stats)
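Before the collected examples, here is a minimal, self-contained sketch of the common pattern: pair stat2Paths with FileSystem.listStatus (or globStatus) to turn the returned FileStatus[] into a Path[]. The class name Stat2PathsSketch and the /data/input path are illustrative placeholders and do not come from the examples below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class Stat2PathsSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path dir = new Path("/data/input");   // hypothetical directory; replace with a real input path
    FileSystem fs = dir.getFileSystem(conf);
    // listStatus returns FileStatus[]; stat2Paths extracts the Path from each entry
    FileStatus[] stats = fs.listStatus(dir);
    Path[] paths = FileUtil.stat2Paths(stats);
    for (Path p : paths) {
      System.out.println(p);
    }
  }
}

The same conversion works with fs.globStatus(pattern), which is how most of the examples below obtain their FileStatus[].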
From source file: org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java
License: Apache License
/**
 * Converts the sequence files present in a directory to a {@link HmmModel} model.
 *
 * @param nrOfHiddenStates Number of hidden states
 * @param nrOfOutputStates Number of output states
 * @param modelPath        Location of the sequence files containing the model's distributions
 * @param conf             Configuration object
 * @return HmmModel the encoded model
 * @throws IOException
 */
public static HmmModel createHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
    Configuration conf) throws IOException {
  log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
  Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
  Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
  Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

  // Get the path location where the seq files encoding model are stored
  Path modelFilesPath = new Path(modelPath, "*");

  Collection<Path> result = new ArrayList<Path>();

  // get all filtered file names in result list
  FileSystem fs = modelFilesPath.getFileSystem(conf);
  FileStatus[] matches = fs.listStatus(
      FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
      PathFilters.partFilter());
  for (FileStatus match : matches) {
    result.add(fs.makeQualified(match.getPath()));
  }

  // iterate through the result path list
  for (Path path : result) {
    for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true, conf)) {
      Text key = (Text) pair.getFirst();
      MapWritable valueMap = pair.getSecond();
      if (key.charAt(0) == (int) 'I') {
        // initial distribution stripe
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          initialProbabilities.set(((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == (int) 'T') {
        // transition distribution stripe
        // key is of the form TRANSIT_0, TRANSIT_1 etc
        int stateID = Integer.parseInt(key.toString().split("_")[1]);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == (int) 'E') {
        // emission distribution stripe
        // key is of the form EMIT_0, EMIT_1 etc
        int stateID = Integer.parseInt(key.toString().split("_")[1]);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else {
        throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
      }
    }
  }
  HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
  if (model != null) {
    return model;
  } else {
    throw new IOException("Error building model from output location");
  }
}
From source file: org.apache.mahout.clustering.classify.ClusterClassificationDriverTest.java
License: Apache License
private void collectVectorsForAssertion() throws IOException {
  Path[] partFilePaths = FileUtil.stat2Paths(fs.globStatus(classifiedOutputPath));
  FileStatus[] listStatus = fs.listStatus(partFilePaths, PathFilters.partFilter());
  for (FileStatus partFile : listStatus) {
    SequenceFile.Reader classifiedVectors = new SequenceFile.Reader(fs, partFile.getPath(), conf);
    Writable clusterIdAsKey = new IntWritable();
    WeightedPropertyVectorWritable point = new WeightedPropertyVectorWritable();
    while (classifiedVectors.next(clusterIdAsKey, point)) {
      collectVector(clusterIdAsKey.toString(), point.getVector());
    }
  }
}
From source file: org.apache.mahout.clustering.topdown.postprocessor.ClusterOutputPostProcessorTest.java
License: Apache License
private List<Vector> getVectorsInCluster(Path clusterPath) throws IOException {
  Path[] partFilePaths = FileUtil.stat2Paths(fs.globStatus(clusterPath));
  FileStatus[] listStatus = fs.listStatus(partFilePaths);
  List<Vector> vectors = Lists.newArrayList();
  for (FileStatus partFile : listStatus) {
    SequenceFile.Reader topLevelClusterReader = new SequenceFile.Reader(fs, partFile.getPath(), conf);
    Writable clusterIdAsKey = new LongWritable();
    VectorWritable point = new VectorWritable();
    while (topLevelClusterReader.next(clusterIdAsKey, point)) {
      vectors.add(point.get());
    }
  }
  return vectors;
}
From source file: org.apache.mahout.utils.SequenceFileDumper.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
  addInputOption();
  addOutputOption();
  addOption("substring", "b", "The number of chars to print out per value", false);
  addOption(buildOption("count", "c", "Report the count only", false, false, null));
  addOption("numItems", "n", "Output at most <n> key value pairs", false);
  addOption(buildOption("facets", "fa", "Output the counts per key.  Note, if there are a lot of unique keys, "
      + "this can take up a fair amount of memory", false, false, null));
  addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));

  if (parseArguments(args, false, true) == null) {
    return -1;
  }

  Path[] pathArr;
  Configuration conf = new Configuration();
  Path input = getInputPath();
  FileSystem fs = input.getFileSystem(conf);
  if (fs.getFileStatus(input).isDir()) {
    pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
  } else {
    pathArr = new Path[1];
    pathArr[0] = input;
  }

  Writer writer;
  boolean shouldClose;
  if (hasOption("output")) {
    shouldClose = true;
    writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8);
  } else {
    shouldClose = false;
    writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
  }
  try {
    for (Path path : pathArr) {
      if (!hasOption("quiet")) {
        writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
      }

      int sub = Integer.MAX_VALUE;
      if (hasOption("substring")) {
        sub = Integer.parseInt(getOption("substring"));
      }
      boolean countOnly = hasOption("count");
      SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true, conf);
      if (!hasOption("quiet")) {
        writer.append("Key class: ").append(iterator.getKeyClass().toString());
        writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
      }
      OpenObjectIntHashMap<String> facets = null;
      if (hasOption("facets")) {
        facets = new OpenObjectIntHashMap<String>();
      }
      long count = 0;
      if (countOnly) {
        while (iterator.hasNext()) {
          Pair<?, ?> record = iterator.next();
          String key = record.getFirst().toString();
          if (facets != null) {
            facets.adjustOrPutValue(key, 1, 1); // either insert or add 1
          }
          count++;
        }
        writer.append("Count: ").append(String.valueOf(count)).append('\n');
      } else {
        long numItems = Long.MAX_VALUE;
        if (hasOption("numItems")) {
          numItems = Long.parseLong(getOption("numItems"));
          if (!hasOption("quiet")) {
            writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
          }
        }
        while (iterator.hasNext() && count < numItems) {
          Pair<?, ?> record = iterator.next();
          String key = record.getFirst().toString();
          writer.append("Key: ").append(key);
          String str = record.getSecond().toString();
          writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
          writer.write('\n');
          if (facets != null) {
            facets.adjustOrPutValue(key, 1, 1); // either insert or add 1
          }
          count++;
        }
        if (!hasOption("quiet")) {
          writer.append("Count: ").append(String.valueOf(count)).append('\n');
        }
      }
      if (facets != null) {
        List<String> keyList = Lists.newArrayListWithCapacity(facets.size());
        IntArrayList valueList = new IntArrayList(facets.size());
        facets.pairsSortedByKey(keyList, valueList);
        writer.append("-----Facets---\n");
        writer.append("Key\t\tCount\n");
        int i = 0;
        for (String key : keyList) {
          writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
        }
      }
    }
    writer.flush();
  } finally {
    if (shouldClose) {
      Closeables.close(writer, false);
    }
  }
  return 0;
}
From source file: org.apache.mahout.utils.vectors.VectorDumper.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
  /**
   Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
       abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
       "The Sequence File containing the Vectors").withShortName("s").create();
   Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
       abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
       .withDescription("The directory containing Sequence File of Vectors")
       .withShortName("d").create();
   */
  addInputOption();
  addOutputOption();
  addOption("useKey", "u", "If the Key is a vector than dump that instead");
  addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
  addOption("dictionary", "d", "The dictionary file.", false);
  addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
  addOption("csv", "c", "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries");
  addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
      + "(if the vector is one) printing out the name");
  addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
  addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude "
      + "descending order");
  addOption("quiet", "q", "Print only file contents");
  addOption("sizeOnly", "sz", "Dump only the size of the vector");
  addOption("numItems", "ni", "Output at most <n> vecors", false);
  addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
      + " conjunction with -sort", false);
  addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
      + " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null));

  if (parseArguments(args, false, true) == null) {
    return -1;
  }

  Path[] pathArr;
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path input = getInputPath();
  FileStatus fileStatus = fs.getFileStatus(input);
  if (fileStatus.isDir()) {
    pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
  } else {
    FileStatus[] inputPaths = fs.globStatus(input);
    pathArr = new Path[inputPaths.length];
    int i = 0;
    for (FileStatus fstatus : inputPaths) {
      pathArr[i++] = fstatus.getPath();
    }
  }

  String dictionaryType = getOption("dictionaryType", "text");
  boolean sortVectors = hasOption("sortVectors");
  boolean quiet = hasOption("quiet");
  if (!quiet) {
    log.info("Sort? {}", sortVectors);
  }

  String[] dictionary = null;
  if (hasOption("dictionary")) {
    String dictFile = getOption("dictionary");
    if ("text".equals(dictionaryType)) {
      dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
    } else if ("sequencefile".equals(dictionaryType)) {
      dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
    } else {
      //TODO: support Lucene's FST as a dictionary type
      throw new IOException("Invalid dictionary type: " + dictionaryType);
    }
  }

  Set<String> filters;
  if (hasOption("filter")) {
    filters = Sets.newHashSet(getOptions("filter"));
  } else {
    filters = null;
  }

  boolean useCSV = hasOption("csv");
  boolean sizeOnly = hasOption("sizeOnly");
  boolean nameOnly = hasOption("nameOnly");
  boolean namesAsComments = hasOption("namesAsComments");
  boolean transposeKeyValue = hasOption("vectorAsKey");

  Writer writer;
  boolean shouldClose;
  File output = getOutputFile();
  if (output != null) {
    shouldClose = true;
    log.info("Output file: {}", output);
    Files.createParentDirs(output);
    writer = Files.newWriter(output, Charsets.UTF_8);
  } else {
    shouldClose = false;
    writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
  }
  try {
    boolean printKey = hasOption("printKey");
    if (useCSV && dictionary != null) {
      writer.write("#");
      for (int j = 0; j < dictionary.length; j++) {
        writer.write(dictionary[j]);
        if (j < dictionary.length - 1) {
          writer.write(',');
        }
      }
      writer.write('\n');
    }
    Long numItems = null;
    if (hasOption("numItems")) {
      numItems = Long.parseLong(getOption("numItems"));
      if (quiet) {
        writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
      }
    }
    int maxIndexesPerVector = hasOption("vectorSize")
        ? Integer.parseInt(getOption("vectorSize"))
        : Integer.MAX_VALUE;
    long itemCount = 0;
    int fileCount = 0;
    for (Path path : pathArr) {
      if (numItems != null && numItems <= itemCount) {
        break;
      }
      if (quiet) {
        log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
      }
      SequenceFileIterable<Writable, Writable> iterable =
          new SequenceFileIterable<Writable, Writable>(path, true, conf);
      Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
      long i = 0;
      while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
        Pair<Writable, Writable> record = iterator.next();
        Writable keyWritable = record.getFirst();
        Writable valueWritable = record.getSecond();
        if (printKey) {
          Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
          writer.write(notTheVectorWritable.toString());
          writer.write('\t');
        }
        Vector vector;
        try {
          vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
        } catch (ClassCastException e) {
          if ((transposeKeyValue ? keyWritable : valueWritable) instanceof WeightedPropertyVectorWritable) {
            vector = ((WeightedPropertyVectorWritable)
                (transposeKeyValue ? keyWritable : valueWritable)).getVector();
          } else {
            throw e;
          }
        }
        if (filters != null && vector instanceof NamedVector
            && !filters.contains(((NamedVector) vector).getName())) {
          // we are filtering out this item, skip
          continue;
        }
        if (sizeOnly) {
          if (vector instanceof NamedVector) {
            writer.write(((NamedVector) vector).getName());
            writer.write(":");
          } else {
            writer.write(String.valueOf(i++));
            writer.write(":");
          }
          writer.write(String.valueOf(vector.size()));
          writer.write('\n');
        } else if (nameOnly) {
          if (vector instanceof NamedVector) {
            writer.write(((NamedVector) vector).getName());
            writer.write('\n');
          }
        } else {
          String fmtStr;
          if (useCSV) {
            fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
          } else {
            fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, sortVectors);
          }
          writer.write(fmtStr);
          writer.write('\n');
        }
        itemCount++;
      }
    }
    writer.flush();
  } finally {
    if (shouldClose) {
      Closeables.close(writer, false);
    }
  }
  return 0;
}
From source file: org.apache.nutch.util.SegmentReaderUtil.java
License: Apache License
public static SequenceFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException {
  FileSystem fs = dir.getFileSystem(conf);
  Path[] names = FileUtil.stat2Paths(fs.listStatus(dir));
  Arrays.sort(names);
  SequenceFile.Reader[] parts = new SequenceFile.Reader[names.length];
  for (int i = 0; i < names.length; i++) {
    parts[i] = new SequenceFile.Reader(conf, SequenceFile.Reader.file(names[i]));
  }
  return parts;
}
From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java
License: Apache License
void chgrp(Context context, XConfiguration fsConf, Path nameNodePath, Path path, String user, String group,
    boolean dirFiles, boolean recursive) throws ActionExecutorException {
  HashMap<String, String> argsMap = new HashMap<String, String>();
  argsMap.put("user", user);
  argsMap.put("group", group);
  try {
    FileSystem fs = getFileSystemFor(path, context, fsConf);
    path = resolveToFullPath(nameNodePath, path, true);
    Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
    if (pathArr == null || pathArr.length == 0) {
      throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS009",
          "chgrp" + ", path(s) that matches [{0}] does not exist", path);
    }
    checkGlobMax(pathArr);
    for (Path p : pathArr) {
      recursiveFsOperation("chgrp", fs, nameNodePath, p, argsMap, dirFiles, recursive, true);
    }
  } catch (Exception ex) {
    throw convertException(ex);
  }
}
From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java
License: Apache License
/**
 * Delete path
 *
 * @param context
 * @param fsConf
 * @param nameNodePath
 * @param path
 * @throws ActionExecutorException
 */
public void delete(Context context, XConfiguration fsConf, Path nameNodePath, Path path, boolean skipTrash)
    throws ActionExecutorException {
  URI uri = path.toUri();
  URIHandler handler;
  try {
    handler = Services.get().get(URIHandlerService.class).getURIHandler(uri);
    if (handler instanceof FSURIHandler) {
      // Use legacy code to handle hdfs partition deletion
      path = resolveToFullPath(nameNodePath, path, true);
      final FileSystem fs = getFileSystemFor(path, context, fsConf);
      Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
      if (pathArr != null && pathArr.length > 0) {
        checkGlobMax(pathArr);
        for (final Path p : pathArr) {
          if (fs.exists(p)) {
            if (!skipTrash) {
              // Moving directory/file to trash of user.
              UserGroupInformationService ugiService = Services.get().get(UserGroupInformationService.class);
              UserGroupInformation ugi = ugiService.getProxyUser(fs.getConf().get(OozieClient.USER_NAME));
              ugi.doAs(new PrivilegedExceptionAction<FileSystem>() {
                @Override
                public FileSystem run() throws Exception {
                  Trash trash = new Trash(fs.getConf());
                  if (!trash.moveToTrash(p)) {
                    throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS005",
                        "Could not move path [{0}] to trash on delete", p);
                  }
                  return null;
                }
              });
            } else if (!fs.delete(p, true)) {
              throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS005",
                  "delete, path [{0}] could not delete path", p);
            }
          }
        }
      }
    } else {
      handler.delete(uri, handler.getContext(uri, fsConf, context.getWorkflow().getUser(), false));
    }
  } catch (Exception ex) {
    throw convertException(ex);
  }
}
From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java
License: Apache License
/**
 * Move source to target
 *
 * @param context
 * @param fsConf
 * @param nameNodePath
 * @param source
 * @param target
 * @param recovery
 * @throws ActionExecutorException
 */
public void move(Context context, XConfiguration fsConf, Path nameNodePath, Path source, Path target,
    boolean recovery) throws ActionExecutorException {
  try {
    source = resolveToFullPath(nameNodePath, source, true);
    validateSameNN(source, target);
    FileSystem fs = getFileSystemFor(source, context, fsConf);
    Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(source));
    if ((pathArr == null || pathArr.length == 0)) {
      if (!recovery) {
        throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS006",
            "move, source path [{0}] does not exist", source);
      } else {
        return;
      }
    }
    if (pathArr.length > 1 && (!fs.exists(target) || fs.isFile(target))) {
      if (!recovery) {
        throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS012",
            "move, could not rename multiple sources to the same target name");
      } else {
        return;
      }
    }
    checkGlobMax(pathArr);
    for (Path p : pathArr) {
      if (!fs.rename(p, target) && !recovery) {
        throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS008",
            "move, could not move [{0}] to [{1}]", p, target);
      }
    }
  } catch (Exception ex) {
    throw convertException(ex);
  }
}
From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java
License: Apache License
void chmod(Context context, XConfiguration fsConf, Path nameNodePath, Path path, String permissions,
    boolean dirFiles, boolean recursive) throws ActionExecutorException {
  HashMap<String, String> argsMap = new HashMap<String, String>();
  argsMap.put("permissions", permissions);
  try {
    FileSystem fs = getFileSystemFor(path, context, fsConf);
    path = resolveToFullPath(nameNodePath, path, true);
    Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
    if (pathArr == null || pathArr.length == 0) {
      throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS009",
          "chmod" + ", path(s) that matches [{0}] does not exist", path);
    }
    checkGlobMax(pathArr);
    for (Path p : pathArr) {
      recursiveFsOperation("chmod", fs, nameNodePath, p, argsMap, dirFiles, recursive, true);
    }
  } catch (Exception ex) {
    throw convertException(ex);
  }
}