List of usage examples for org.apache.hadoop.fs FileUtil stat2Paths
public static Path[] stat2Paths(FileStatus[] stats)
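Before the collected examples, here is a minimal, self-contained sketch of the common pattern: pair stat2Paths with FileSystem.listStatus (or globStatus) to turn the returned FileStatus[] into a Path[]. The class name Stat2PathsSketch and the /data/input path are illustrative placeholders and do not come from the examples below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class Stat2PathsSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path dir = new Path("/data/input");   // hypothetical directory; replace with a real input path
    FileSystem fs = dir.getFileSystem(conf);
    // listStatus returns FileStatus[]; stat2Paths extracts the Path from each entry
    FileStatus[] stats = fs.listStatus(dir);
    Path[] paths = FileUtil.stat2Paths(stats);
    for (Path p : paths) {
      System.out.println(p);
    }
  }
}

The same conversion works with fs.globStatus(pattern), which is how most of the examples below obtain their FileStatus[].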
From source file: org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java
License: Apache License
/**
 * Converts the sequence files present in a directory to a {@link HmmModel} model.
 *
 * @param nrOfHiddenStates Number of hidden states
 * @param nrOfOutputStates Number of output states
 * @param modelPath        Location of the sequence files containing the model's distributions
 * @param conf             Configuration object
 * @return HmmModel the encoded model
 * @throws IOException
 */
public static HmmModel createHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
    Configuration conf) throws IOException {
  log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
  Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
  Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
  Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

  // Get the path location where the seq files encoding model are stored
  Path modelFilesPath = new Path(modelPath, "*");

  Collection<Path> result = new ArrayList<Path>();

  // get all filtered file names in result list
  FileSystem fs = modelFilesPath.getFileSystem(conf);
  FileStatus[] matches = fs.listStatus(
      FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
      PathFilters.partFilter());
  for (FileStatus match : matches) {
    result.add(fs.makeQualified(match.getPath()));
  }

  // iterate through the result path list
  for (Path path : result) {
    for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true, conf)) {
      Text key = (Text) pair.getFirst();
      MapWritable valueMap = pair.getSecond();
      if (key.charAt(0) == (int) 'I') {
        // initial distribution stripe
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          initialProbabilities.set(((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == (int) 'T') {
        // transition distribution stripe
        // key is of the form TRANSIT_0, TRANSIT_1 etc
        int stateID = Integer.parseInt(key.toString().split("_")[1]);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == (int) 'E') {
        // emission distribution stripe
        // key is of the form EMIT_0, EMIT_1 etc
        int stateID = Integer.parseInt(key.toString().split("_")[1]);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else {
        throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
      }
    }
  }
  HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
  if (model != null) {
    return model;
  } else {
    throw new IOException("Error building model from output location");
  }
}
From source file: org.apache.mahout.clustering.classify.ClusterClassificationDriverTest.java
License: Apache License
private void collectVectorsForAssertion() throws IOException {
  Path[] partFilePaths = FileUtil.stat2Paths(fs.globStatus(classifiedOutputPath));
  FileStatus[] listStatus = fs.listStatus(partFilePaths, PathFilters.partFilter());
  for (FileStatus partFile : listStatus) {
    SequenceFile.Reader classifiedVectors = new SequenceFile.Reader(fs, partFile.getPath(), conf);
    Writable clusterIdAsKey = new IntWritable();
    WeightedPropertyVectorWritable point = new WeightedPropertyVectorWritable();
    while (classifiedVectors.next(clusterIdAsKey, point)) {
      collectVector(clusterIdAsKey.toString(), point.getVector());
    }
  }
}
From source file: org.apache.mahout.clustering.topdown.postprocessor.ClusterOutputPostProcessorTest.java
License: Apache License
private List<Vector> getVectorsInCluster(Path clusterPath) throws IOException {
  Path[] partFilePaths = FileUtil.stat2Paths(fs.globStatus(clusterPath));
  FileStatus[] listStatus = fs.listStatus(partFilePaths);
  List<Vector> vectors = Lists.newArrayList();
  for (FileStatus partFile : listStatus) {
    SequenceFile.Reader topLevelClusterReader = new SequenceFile.Reader(fs, partFile.getPath(), conf);
    Writable clusterIdAsKey = new LongWritable();
    VectorWritable point = new VectorWritable();
    while (topLevelClusterReader.next(clusterIdAsKey, point)) {
      vectors.add(point.get());
    }
  }
  return vectors;
}
From source file: org.apache.mahout.utils.SequenceFileDumper.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
  addInputOption();
  addOutputOption();
  addOption("substring", "b", "The number of chars to print out per value", false);
  addOption(buildOption("count", "c", "Report the count only", false, false, null));
  addOption("numItems", "n", "Output at most <n> key value pairs", false);
  addOption(buildOption("facets", "fa", "Output the counts per key.  Note, if there are a lot of unique keys, "
      + "this can take up a fair amount of memory", false, false, null));
  addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));

  if (parseArguments(args, false, true) == null) {
    return -1;
  }

  Path[] pathArr;
  Configuration conf = new Configuration();
  Path input = getInputPath();
  FileSystem fs = input.getFileSystem(conf);
  if (fs.getFileStatus(input).isDir()) {
    pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
  } else {
    pathArr = new Path[1];
    pathArr[0] = input;
  }

  Writer writer;
  boolean shouldClose;
  if (hasOption("output")) {
    shouldClose = true;
    writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8);
  } else {
    shouldClose = false;
    writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
  }
  try {
    for (Path path : pathArr) {
      if (!hasOption("quiet")) {
        writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
      }

      int sub = Integer.MAX_VALUE;
      if (hasOption("substring")) {
        sub = Integer.parseInt(getOption("substring"));
      }
      boolean countOnly = hasOption("count");
      SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true, conf);
      if (!hasOption("quiet")) {
        writer.append("Key class: ").append(iterator.getKeyClass().toString());
        writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
      }
      OpenObjectIntHashMap<String> facets = null;
      if (hasOption("facets")) {
        facets = new OpenObjectIntHashMap<String>();
      }
      long count = 0;
      if (countOnly) {
        while (iterator.hasNext()) {
          Pair<?, ?> record = iterator.next();
          String key = record.getFirst().toString();
          if (facets != null) {
            facets.adjustOrPutValue(key, 1, 1); // either insert or add 1
          }
          count++;
        }
        writer.append("Count: ").append(String.valueOf(count)).append('\n');
      } else {
        long numItems = Long.MAX_VALUE;
        if (hasOption("numItems")) {
          numItems = Long.parseLong(getOption("numItems"));
          if (!hasOption("quiet")) {
            writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
          }
        }
        while (iterator.hasNext() && count < numItems) {
          Pair<?, ?> record = iterator.next();
          String key = record.getFirst().toString();
          writer.append("Key: ").append(key);
          String str = record.getSecond().toString();
          writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
          writer.write('\n');
          if (facets != null) {
            facets.adjustOrPutValue(key, 1, 1); // either insert or add 1
          }
          count++;
        }
        if (!hasOption("quiet")) {
          writer.append("Count: ").append(String.valueOf(count)).append('\n');
        }
      }
      if (facets != null) {
        List<String> keyList = Lists.newArrayListWithCapacity(facets.size());
        IntArrayList valueList = new IntArrayList(facets.size());
        facets.pairsSortedByKey(keyList, valueList);
        writer.append("-----Facets---\n");
        writer.append("Key\t\tCount\n");
        int i = 0;
        for (String key : keyList) {
          writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
        }
      }
    }
    writer.flush();
  } finally {
    if (shouldClose) {
      Closeables.close(writer, false);
    }
  }
  return 0;
}
From source file: org.apache.mahout.utils.vectors.VectorDumper.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
  /**
   Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
       abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
       "The Sequence File containing the Vectors").withShortName("s").create();
   Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
       abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
       .withDescription("The directory containing Sequence File of Vectors")
       .withShortName("d").create();
   */
  addInputOption();
  addOutputOption();
  addOption("useKey", "u", "If the Key is a vector than dump that instead");
  addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
  addOption("dictionary", "d", "The dictionary file.", false);
  addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
  addOption("csv", "c", "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries");
  addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
      + "(if the vector is one) printing out the name");
  addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
  addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude "
      + "descending order");
  addOption("quiet", "q", "Print only file contents");
  addOption("sizeOnly", "sz", "Dump only the size of the vector");
  addOption("numItems", "ni", "Output at most <n> vecors", false);
  addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
      + " conjunction with -sort", false);
  addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
      + " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null));

  if (parseArguments(args, false, true) == null) {
    return -1;
  }

  Path[] pathArr;
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path input = getInputPath();
  FileStatus fileStatus = fs.getFileStatus(input);
  if (fileStatus.isDir()) {
    pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
  } else {
    FileStatus[] inputPaths = fs.globStatus(input);
    pathArr = new Path[inputPaths.length];
    int i = 0;
    for (FileStatus fstatus : inputPaths) {
      pathArr[i++] = fstatus.getPath();
    }
  }

  String dictionaryType = getOption("dictionaryType", "text");
  boolean sortVectors = hasOption("sortVectors");
  boolean quiet = hasOption("quiet");
  if (!quiet) {
    log.info("Sort? {}", sortVectors);
  }

  String[] dictionary = null;
  if (hasOption("dictionary")) {
    String dictFile = getOption("dictionary");
    if ("text".equals(dictionaryType)) {
      dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
    } else if ("sequencefile".equals(dictionaryType)) {
      dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
    } else {
      //TODO: support Lucene's FST as a dictionary type
      throw new IOException("Invalid dictionary type: " + dictionaryType);
    }
  }

  Set<String> filters;
  if (hasOption("filter")) {
    filters = Sets.newHashSet(getOptions("filter"));
  } else {
    filters = null;
  }

  boolean useCSV = hasOption("csv");
  boolean sizeOnly = hasOption("sizeOnly");
  boolean nameOnly = hasOption("nameOnly");
  boolean namesAsComments = hasOption("namesAsComments");
  boolean transposeKeyValue = hasOption("vectorAsKey");

  Writer writer;
  boolean shouldClose;
  File output = getOutputFile();
  if (output != null) {
    shouldClose = true;
    log.info("Output file: {}", output);
    Files.createParentDirs(output);
    writer = Files.newWriter(output, Charsets.UTF_8);
  } else {
    shouldClose = false;
    writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
  }
  try {
    boolean printKey = hasOption("printKey");
    if (useCSV && dictionary != null) {
      writer.write("#");
      for (int j = 0; j < dictionary.length; j++) {
        writer.write(dictionary[j]);
        if (j < dictionary.length - 1) {
          writer.write(',');
        }
      }
      writer.write('\n');
    }
    Long numItems = null;
    if (hasOption("numItems")) {
      numItems = Long.parseLong(getOption("numItems"));
      if (quiet) {
        writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
      }
    }
    int maxIndexesPerVector = hasOption("vectorSize")
        ? Integer.parseInt(getOption("vectorSize"))
        : Integer.MAX_VALUE;
    long itemCount = 0;
    int fileCount = 0;
    for (Path path : pathArr) {
      if (numItems != null && numItems <= itemCount) {
        break;
      }
      if (quiet) {
        log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
      }
      SequenceFileIterable<Writable, Writable> iterable =
          new SequenceFileIterable<Writable, Writable>(path, true, conf);
      Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
      long i = 0;
      while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
        Pair<Writable, Writable> record = iterator.next();
        Writable keyWritable = record.getFirst();
        Writable valueWritable = record.getSecond();
        if (printKey) {
          Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
          writer.write(notTheVectorWritable.toString());
          writer.write('\t');
        }
        Vector vector;
        try {
          vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
        } catch (ClassCastException e) {
          if ((transposeKeyValue ? keyWritable : valueWritable) instanceof WeightedPropertyVectorWritable) {
            vector = ((WeightedPropertyVectorWritable)
                (transposeKeyValue ? keyWritable : valueWritable)).getVector();
          } else {
            throw e;
          }
        }
        if (filters != null && vector instanceof NamedVector
            && !filters.contains(((NamedVector) vector).getName())) {
          // we are filtering out this item, skip
          continue;
        }
        if (sizeOnly) {
          if (vector instanceof NamedVector) {
            writer.write(((NamedVector) vector).getName());
            writer.write(":");
          } else {
            writer.write(String.valueOf(i++));
            writer.write(":");
          }
          writer.write(String.valueOf(vector.size()));
          writer.write('\n');
        } else if (nameOnly) {
          if (vector instanceof NamedVector) {
            writer.write(((NamedVector) vector).getName());
            writer.write('\n');
          }
        } else {
          String fmtStr;
          if (useCSV) {
            fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
          } else {
            fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, sortVectors);
          }
          writer.write(fmtStr);
          writer.write('\n');
        }
        itemCount++;
      }
    }
    writer.flush();
  } finally {
    if (shouldClose) {
      Closeables.close(writer, false);
    }
  }
  return 0;
}
From source file: org.apache.nutch.util.SegmentReaderUtil.java
License: Apache License
public static SequenceFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException {
  FileSystem fs = dir.getFileSystem(conf);
  Path[] names = FileUtil.stat2Paths(fs.listStatus(dir));
  Arrays.sort(names);
  SequenceFile.Reader[] parts = new SequenceFile.Reader[names.length];
  for (int i = 0; i < names.length; i++) {
    parts[i] = new SequenceFile.Reader(conf, SequenceFile.Reader.file(names[i]));
  }
  return parts;
}
From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java
License: Apache License
void chgrp(Context context, XConfiguration fsConf, Path nameNodePath, Path path, String user, String group,
    boolean dirFiles, boolean recursive) throws ActionExecutorException {
  HashMap<String, String> argsMap = new HashMap<String, String>();
  argsMap.put("user", user);
  argsMap.put("group", group);
  try {
    FileSystem fs = getFileSystemFor(path, context, fsConf);
    path = resolveToFullPath(nameNodePath, path, true);
    Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
    if (pathArr == null || pathArr.length == 0) {
      throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS009",
          "chgrp" + ", path(s) that matches [{0}] does not exist", path);
    }
    checkGlobMax(pathArr);
    for (Path p : pathArr) {
      recursiveFsOperation("chgrp", fs, nameNodePath, p, argsMap, dirFiles, recursive, true);
    }
  } catch (Exception ex) {
    throw convertException(ex);
  }
}
From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java
License: Apache License
/**
 * Delete path
 *
 * @param context
 * @param fsConf
 * @param nameNodePath
 * @param path
 * @throws ActionExecutorException
 */
public void delete(Context context, XConfiguration fsConf, Path nameNodePath, Path path, boolean skipTrash)
    throws ActionExecutorException {
  URI uri = path.toUri();
  URIHandler handler;
  try {
    handler = Services.get().get(URIHandlerService.class).getURIHandler(uri);
    if (handler instanceof FSURIHandler) {
      // Use legacy code to handle hdfs partition deletion
      path = resolveToFullPath(nameNodePath, path, true);
      final FileSystem fs = getFileSystemFor(path, context, fsConf);
      Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
      if (pathArr != null && pathArr.length > 0) {
        checkGlobMax(pathArr);
        for (final Path p : pathArr) {
          if (fs.exists(p)) {
            if (!skipTrash) {
              // Moving directory/file to trash of user.
              UserGroupInformationService ugiService = Services.get().get(UserGroupInformationService.class);
              UserGroupInformation ugi = ugiService.getProxyUser(fs.getConf().get(OozieClient.USER_NAME));
              ugi.doAs(new PrivilegedExceptionAction<FileSystem>() {
                @Override
                public FileSystem run() throws Exception {
                  Trash trash = new Trash(fs.getConf());
                  if (!trash.moveToTrash(p)) {
                    throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS005",
                        "Could not move path [{0}] to trash on delete", p);
                  }
                  return null;
                }
              });
            } else if (!fs.delete(p, true)) {
              throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS005",
                  "delete, path [{0}] could not delete path", p);
            }
          }
        }
      }
    } else {
      handler.delete(uri, handler.getContext(uri, fsConf, context.getWorkflow().getUser(), false));
    }
  } catch (Exception ex) {
    throw convertException(ex);
  }
}
From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java
License: Apache License
/**
 * Move source to target
 *
 * @param context
 * @param fsConf
 * @param nameNodePath
 * @param source
 * @param target
 * @param recovery
 * @throws ActionExecutorException
 */
public void move(Context context, XConfiguration fsConf, Path nameNodePath, Path source, Path target,
    boolean recovery) throws ActionExecutorException {
  try {
    source = resolveToFullPath(nameNodePath, source, true);
    validateSameNN(source, target);
    FileSystem fs = getFileSystemFor(source, context, fsConf);
    Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(source));
    if ((pathArr == null || pathArr.length == 0)) {
      if (!recovery) {
        throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS006",
            "move, source path [{0}] does not exist", source);
      } else {
        return;
      }
    }
    if (pathArr.length > 1 && (!fs.exists(target) || fs.isFile(target))) {
      if (!recovery) {
        throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS012",
            "move, could not rename multiple sources to the same target name");
      } else {
        return;
      }
    }
    checkGlobMax(pathArr);
    for (Path p : pathArr) {
      if (!fs.rename(p, target) && !recovery) {
        throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS008",
            "move, could not move [{0}] to [{1}]", p, target);
      }
    }
  } catch (Exception ex) {
    throw convertException(ex);
  }
}
From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java
License: Apache License
void chmod(Context context, XConfiguration fsConf, Path nameNodePath, Path path, String permissions,
    boolean dirFiles, boolean recursive) throws ActionExecutorException {
  HashMap<String, String> argsMap = new HashMap<String, String>();
  argsMap.put("permissions", permissions);
  try {
    FileSystem fs = getFileSystemFor(path, context, fsConf);
    path = resolveToFullPath(nameNodePath, path, true);
    Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
    if (pathArr == null || pathArr.length == 0) {
      throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS009",
          "chmod" + ", path(s) that matches [{0}] does not exist", path);
    }
    checkGlobMax(pathArr);
    for (Path p : pathArr) {
      recursiveFsOperation("chmod", fs, nameNodePath, p, argsMap, dirFiles, recursive, true);
    }
  } catch (Exception ex) {
    throw convertException(ex);
  }
}