Example usage for org.apache.hadoop.fs FileUtil stat2Paths

List of usage examples for org.apache.hadoop.fs FileUtil stat2Paths

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileUtil stat2Paths.

Prototype

public static Path[] stat2Paths(FileStatus[] stats) 

Document

Convert an array of FileStatus objects to an array of Path objects.
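
For orientation, here is a minimal, self-contained sketch of the typical pattern: list (or glob) a directory, then convert the resulting FileStatus array into plain paths. The directory name below is a placeholder, not taken from any of the examples that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class Stat2PathsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path("/tmp/data"); // placeholder directory
        FileSystem fs = dir.getFileSystem(conf);

        // listStatus (like globStatus) returns FileStatus[]; stat2Paths
        // extracts just the Path from each entry.
        FileStatus[] stats = fs.listStatus(dir);
        for (Path p : FileUtil.stat2Paths(stats)) {
            System.out.println(p);
        }
    }
}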

Usage

From source file: org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java

License: Apache License

/**
 * Converts the sequence files present in a directory to a {@link HmmModel} model.
 *
 * @param nrOfHiddenStates Number of hidden states
 * @param nrOfOutputStates Number of output states
 * @param modelPath        Location of the sequence files containing the model's distributions
 * @param conf             Configuration object
 * @return the encoded {@link HmmModel}
 * @throws IOException if the model files cannot be read
 */
public static HmmModel createHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
        Configuration conf) throws IOException {

    log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
    Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
    Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
    Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

    // Build a glob path covering the sequence files that encode the model
    Path modelFilesPath = new Path(modelPath, "*");

    Collection<Path> result = new ArrayList<Path>();

    // Collect the qualified paths of all matching part files
    FileSystem fs = modelFilesPath.getFileSystem(conf);
    FileStatus[] matches = fs.listStatus(
            FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
            PathFilters.partFilter());

    for (FileStatus match : matches) {
        result.add(fs.makeQualified(match.getPath()));
    }

    // iterate through the result path list
    for (Path path : result) {
        for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true,
                conf)) {
            Text key = (Text) pair.getFirst();
            MapWritable valueMap = pair.getSecond();
            if (key.charAt(0) == (int) 'I') {
                // initial distribution stripe
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    initialProbabilities.set(((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == (int) 'T') {
                // transition distribution stripe
                // key is of the form TRANSIT_0, TRANSIT_1 etc
                int stateID = Integer.parseInt(key.toString().split("_")[1]);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == (int) 'E') {
                // emission distribution stripe
                // key is of the form EMIT_0, EMIT_1 etc
                int stateID = Integer.parseInt(key.toString().split("_")[1]);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else {
                throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
            }
        }
    }

    return new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
}

From source file: org.apache.mahout.clustering.classify.ClusterClassificationDriverTest.java

License: Apache License

private void collectVectorsForAssertion() throws IOException {
    Path[] partFilePaths = FileUtil.stat2Paths(fs.globStatus(classifiedOutputPath));
    FileStatus[] listStatus = fs.listStatus(partFilePaths, PathFilters.partFilter());
    for (FileStatus partFile : listStatus) {
        SequenceFile.Reader classifiedVectors = new SequenceFile.Reader(fs, partFile.getPath(), conf);
        Writable clusterIdAsKey = new IntWritable();
        WeightedPropertyVectorWritable point = new WeightedPropertyVectorWritable();
        while (classifiedVectors.next(clusterIdAsKey, point)) {
            collectVector(clusterIdAsKey.toString(), point.getVector());
        }
    }
}

From source file: org.apache.mahout.clustering.topdown.postprocessor.ClusterOutputPostProcessorTest.java

License: Apache License

private List<Vector> getVectorsInCluster(Path clusterPath) throws IOException {
    Path[] partFilePaths = FileUtil.stat2Paths(fs.globStatus(clusterPath));
    FileStatus[] listStatus = fs.listStatus(partFilePaths);
    List<Vector> vectors = Lists.newArrayList();
    for (FileStatus partFile : listStatus) {
        SequenceFile.Reader topLevelClusterReader = new SequenceFile.Reader(fs, partFile.getPath(), conf);
        Writable clusterIdAsKey = new LongWritable();
        VectorWritable point = new VectorWritable();
        while (topLevelClusterReader.next(clusterIdAsKey, point)) {
            vectors.add(point.get());
        }
    }
    return vectors;
}

From source file: org.apache.mahout.utils.SequenceFileDumper.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("substring", "b", "The number of chars to print out per value", false);
    addOption(buildOption("count", "c", "Report the count only", false, false, null));
    addOption("numItems", "n", "Output at most <n> key value pairs", false);
    addOption(
            buildOption("facets", "fa", "Output the counts per key.  Note, if there are a lot of unique keys, "
                    + "this can take up a fair amount of memory", false, false, null));
    addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    Path input = getInputPath();
    FileSystem fs = input.getFileSystem(conf);
    if (fs.getFileStatus(input).isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
    } else {
        pathArr = new Path[1];
        pathArr[0] = input;
    }

    Writer writer;
    boolean shouldClose;
    if (hasOption("output")) {
        shouldClose = true;
        writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        for (Path path : pathArr) {
            if (!hasOption("quiet")) {
                writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
            }

            int sub = Integer.MAX_VALUE;
            if (hasOption("substring")) {
                sub = Integer.parseInt(getOption("substring"));
            }
            boolean countOnly = hasOption("count");
            SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true,
                    conf);
            if (!hasOption("quiet")) {
                writer.append("Key class: ").append(iterator.getKeyClass().toString());
                writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
            }
            OpenObjectIntHashMap<String> facets = null;
            if (hasOption("facets")) {
                facets = new OpenObjectIntHashMap<String>();
            }
            long count = 0;
            if (countOnly) {
                while (iterator.hasNext()) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); //either insert or add 1
                    }
                    count++;
                }
                writer.append("Count: ").append(String.valueOf(count)).append('\n');
            } else {
                long numItems = Long.MAX_VALUE;
                if (hasOption("numItems")) {
                    numItems = Long.parseLong(getOption("numItems"));
                    if (!hasOption("quiet")) {
                        writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
                    }
                }
                while (iterator.hasNext() && count < numItems) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    writer.append("Key: ").append(key);
                    String str = record.getSecond().toString();
                    writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
                    writer.write('\n');
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); //either insert or add 1
                    }
                    count++;
                }
                if (!hasOption("quiet")) {
                    writer.append("Count: ").append(String.valueOf(count)).append('\n');
                }
            }
            if (facets != null) {
                List<String> keyList = Lists.newArrayListWithCapacity(facets.size());

                IntArrayList valueList = new IntArrayList(facets.size());
                facets.pairsSortedByKey(keyList, valueList);
                writer.append("-----Facets---\n");
                writer.append("Key\t\tCount\n");
                int i = 0;
                for (String key : keyList) {
                    writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
                }
            }
        }
        writer.flush();

    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }

    return 0;
}

From source file: org.apache.mahout.utils.vectors.VectorDumper.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    /*
     Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
     abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
     "The Sequence File containing the Vectors").withShortName("s").create();
     Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
     abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
     .withDescription("The directory containing Sequence File of Vectors")
     .withShortName("d").create();
     */
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c",
            "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
            + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort",
            "Sort output key/value pairs of the vector entries in abs magnitude " + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs",
            "Truncate vectors to <vs> length when dumping (most useful when in" + " conjunction with -sort",
            false);
    addOption(buildOption("filter", "fi",
            "Only dump out those vectors whose name matches the filter."
                    + "  Multiple items may be specified by repeating the argument.",
            true, 1, Integer.MAX_VALUE, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
    } else {
        FileStatus[] inputPaths = fs.globStatus(input);
        pathArr = new Path[inputPaths.length];
        int i = 0;
        for (FileStatus fstatus : inputPaths) {
            pathArr[i++] = fstatus.getPath();
        }
    }

    String dictionaryType = getOption("dictionaryType", "text");

    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
        log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
        String dictFile = getOption("dictionary");
        if ("text".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
        } else if ("sequencefile".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
        } else {
            //TODO: support Lucene's FST as a dictionary type
            throw new IOException("Invalid dictionary type: " + dictionaryType);
        }
    }

    Set<String> filters;
    if (hasOption("filter")) {
        filters = Sets.newHashSet(getOptions("filter"));
    } else {
        filters = null;
    }

    boolean useCSV = hasOption("csv");

    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    boolean transposeKeyValue = hasOption("useKey");
    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
        shouldClose = true;
        log.info("Output file: {}", output);
        Files.createParentDirs(output);
        writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        boolean printKey = hasOption("printKey");
        if (useCSV && dictionary != null) {
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
                writer.write(dictionary[j]);
                if (j < dictionary.length - 1) {
                    writer.write(',');
                }
            }
            writer.write('\n');
        }
        Long numItems = null;
        if (hasOption("numItems")) {
            numItems = Long.parseLong(getOption("numItems"));
            if (!quiet) {
                writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
            }
        }
        int maxIndexesPerVector = hasOption("vectorSize") ? Integer.parseInt(getOption("vectorSize"))
                : Integer.MAX_VALUE;
        long itemCount = 0;
        int fileCount = 0;
        for (Path path : pathArr) {
            if (numItems != null && numItems <= itemCount) {
                break;
            }
            if (!quiet) {
                log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
            }
            SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<Writable, Writable>(
                    path, true, conf);
            Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
            long i = 0;
            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
                Pair<Writable, Writable> record = iterator.next();
                Writable keyWritable = record.getFirst();
                Writable valueWritable = record.getSecond();
                if (printKey) {
                    Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
                    writer.write(notTheVectorWritable.toString());
                    writer.write('\t');
                }
                Vector vector;
                try {
                    vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
                } catch (ClassCastException e) {
                    if ((transposeKeyValue ? keyWritable
                            : valueWritable) instanceof WeightedPropertyVectorWritable) {
                        vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable
                                : valueWritable)).getVector();
                    } else {
                        throw e;
                    }
                }
                if (filters != null && vector instanceof NamedVector
                        && !filters.contains(((NamedVector) vector).getName())) {
                    //we are filtering out this item, skip
                    continue;
                }
                if (sizeOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write(":");
                    } else {
                        writer.write(String.valueOf(i++));
                        writer.write(":");
                    }
                    writer.write(String.valueOf(vector.size()));
                    writer.write('\n');
                } else if (nameOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write('\n');
                    }
                } else {
                    String fmtStr;
                    if (useCSV) {
                        fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
                    } else {
                        fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
                                sortVectors);
                    }
                    writer.write(fmtStr);
                    writer.write('\n');
                }
                itemCount++;
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }

    return 0;
}

From source file: org.apache.nutch.util.SegmentReaderUtil.java

License: Apache License

public static SequenceFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException {
    FileSystem fs = dir.getFileSystem(conf);
    Path[] names = FileUtil.stat2Paths(fs.listStatus(dir));
    Arrays.sort(names);
    SequenceFile.Reader[] parts = new SequenceFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new SequenceFile.Reader(conf, SequenceFile.Reader.file(names[i]));
    }
    return parts;
}

From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java

License: Apache License

void chgrp(Context context, XConfiguration fsConf, Path nameNodePath, Path path, String user, String group,
        boolean dirFiles, boolean recursive) throws ActionExecutorException {

    HashMap<String, String> argsMap = new HashMap<String, String>();
    argsMap.put("user", user);
    argsMap.put("group", group);
    try {
        FileSystem fs = getFileSystemFor(path, context, fsConf);
        path = resolveToFullPath(nameNodePath, path, true);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
        if (pathArr == null || pathArr.length == 0) {
            throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS009",
                    "chgrp" + ", path(s) that matches [{0}] does not exist", path);
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            recursiveFsOperation("chgrp", fs, nameNodePath, p, argsMap, dirFiles, recursive, true);
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}

From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java

License: Apache License

/**
 * Delete a path (which may be a glob).
 *
 * @param context action executor context
 * @param fsConf filesystem configuration for the action
 * @param nameNodePath NameNode path used to resolve relative paths
 * @param path path or glob to delete
 * @param skipTrash if true, delete permanently instead of moving the path to the user's trash
 * @throws ActionExecutorException if the path cannot be deleted
 */
public void delete(Context context, XConfiguration fsConf, Path nameNodePath, Path path, boolean skipTrash)
        throws ActionExecutorException {
    URI uri = path.toUri();
    URIHandler handler;
    try {
        handler = Services.get().get(URIHandlerService.class).getURIHandler(uri);
        if (handler instanceof FSURIHandler) {
            // Use legacy code to handle hdfs partition deletion
            path = resolveToFullPath(nameNodePath, path, true);
            final FileSystem fs = getFileSystemFor(path, context, fsConf);
            Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
            if (pathArr != null && pathArr.length > 0) {
                checkGlobMax(pathArr);
                for (final Path p : pathArr) {
                    if (fs.exists(p)) {
                        if (!skipTrash) {
                            // Moving directory/file to trash of user.
                            UserGroupInformationService ugiService = Services.get()
                                    .get(UserGroupInformationService.class);
                            UserGroupInformation ugi = ugiService
                                    .getProxyUser(fs.getConf().get(OozieClient.USER_NAME));
                            ugi.doAs(new PrivilegedExceptionAction<FileSystem>() {
                                @Override
                                public FileSystem run() throws Exception {
                                    Trash trash = new Trash(fs.getConf());
                                    if (!trash.moveToTrash(p)) {
                                        throw new ActionExecutorException(
                                                ActionExecutorException.ErrorType.ERROR, "FS005",
                                                "Could not move path [{0}] to trash on delete", p);
                                    }
                                    return null;
                                }
                            });
                        } else if (!fs.delete(p, true)) {
                            throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS005",
                                    "delete, path [{0}] could not delete path", p);
                        }
                    }
                }
            }
        } else {
            handler.delete(uri, handler.getContext(uri, fsConf, context.getWorkflow().getUser(), false));
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}

From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java

License: Apache License

/**
 * Move source to target.
 *
 * @param context action executor context
 * @param fsConf filesystem configuration for the action
 * @param nameNodePath NameNode path used to resolve relative paths
 * @param source source path (may be a glob)
 * @param target target path
 * @param recovery if true, silently skip sources that no longer exist instead of failing
 * @throws ActionExecutorException if the move fails
 */
public void move(Context context, XConfiguration fsConf, Path nameNodePath, Path source, Path target,
        boolean recovery) throws ActionExecutorException {
    try {
        source = resolveToFullPath(nameNodePath, source, true);
        validateSameNN(source, target);
        FileSystem fs = getFileSystemFor(source, context, fsConf);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(source));
        if ((pathArr == null || pathArr.length == 0)) {
            if (!recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS006",
                        "move, source path [{0}] does not exist", source);
            } else {
                return;
            }
        }
        if (pathArr.length > 1 && (!fs.exists(target) || fs.isFile(target))) {
            if (!recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS012",
                        "move, could not rename multiple sources to the same target name");
            } else {
                return;
            }
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            if (!fs.rename(p, target) && !recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS008",
                        "move, could not move [{0}] to [{1}]", p, target);
            }
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}

From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java

License: Apache License

void chmod(Context context, XConfiguration fsConf, Path nameNodePath, Path path, String permissions,
        boolean dirFiles, boolean recursive) throws ActionExecutorException {

    HashMap<String, String> argsMap = new HashMap<String, String>();
    argsMap.put("permissions", permissions);
    try {
        FileSystem fs = getFileSystemFor(path, context, fsConf);
        path = resolveToFullPath(nameNodePath, path, true);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
        if (pathArr == null || pathArr.length == 0) {
            throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS009",
                    "chmod" + ", path(s) that matches [{0}] does not exist", path);
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            recursiveFsOperation("chmod", fs, nameNodePath, p, argsMap, dirFiles, recursive, true);
        }

    } catch (Exception ex) {
        throw convertException(ex);
    }
}
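
One detail all three FsActionExecutor examples above guard against: fs.globStatus can return null when nothing matching exists, and stat2Paths appears to pass that null through (the Oozie null checks suggest as much), so the result must be checked before use. A minimal sketch of the guard, where fs and somePath are placeholder names:

    FileStatus[] matches = fs.globStatus(somePath);
    Path[] pathArr = FileUtil.stat2Paths(matches);
    if (pathArr == null || pathArr.length == 0) {
        // nothing matched: handle the empty case instead of iterating
    } else {
        for (Path p : pathArr) {
            // process each matched path
        }
    }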