List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match the given pathPattern and are not checksum files.
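Before the project examples below, here is a minimal self-contained sketch of the call itself (the glob pattern and paths are hypothetical, not taken from any of the source files). Depending on the input, globStatus may return null (a non-glob path that does not exist) or an empty array (a glob that matches nothing), so callers should guard before iterating, as most of the examples below do.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical pattern: match all part files under any 2023 dated directory.
        Path pattern = new Path("/data/logs/2023-*/part-*");
        FileSystem fs = pattern.getFileSystem(conf);

        // globStatus may return null or an empty array, so guard before iterating.
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}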
From source file:org.apache.mahout.utils.vectors.tfidf.TFIDFConverter.java
License:Apache License
/**
 * Read the document frequency List which is built at the end of the DF Count Job. This will use constant
 * memory and will run at the speed of your disk read.
 *
 * @param featureCountPath
 * @param dictionaryPathBase
 * @throws IOException
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = new ArrayList<Path>();
    IntWritable key = new IntWritable();
    LongWritable value = new LongWritable();
    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(featureCountPath, OUTPUT_FILES_PATTERN));

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);

    long currentChunkSize = 0;
    long featureCount = 0;
    long vectorCount = Long.MAX_VALUE;
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is feature, value is count
        while (reader.next(key, value)) {
            if (currentChunkSize > chunkSizeLimit) {
                freqWriter.close();
                chunkIndex++;
                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);
                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);
        }
    }
    featureCount++;
    freqWriter.close();
    Long[] counts = { featureCount, vectorCount };
    return new Pair<Long[], List<Path>>(counts, chunkPaths);
}
From source file:org.apache.mahout.utils.vectors.VectorDumper.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    /**
     Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
             abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
             "The Sequence File containing the Vectors").withShortName("s").create();
     Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
             abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
             .withDescription("The directory containing Sequence File of Vectors")
             .withShortName("d").create();
     */
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c",
            "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
            + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude "
            + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
            + " conjunction with -sort", false);
    addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
            + " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false,
            null));
    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
    } else {
        FileStatus[] inputPaths = fs.globStatus(input);
        pathArr = new Path[inputPaths.length];
        int i = 0;
        for (FileStatus fstatus : inputPaths) {
            pathArr[i++] = fstatus.getPath();
        }
    }

    String dictionaryType = getOption("dictionaryType", "text");
    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
        log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
        String dictFile = getOption("dictionary");
        if ("text".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
        } else if ("sequencefile".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
        } else {
            //TODO: support Lucene's FST as a dictionary type
            throw new IOException("Invalid dictionary type: " + dictionaryType);
        }
    }

    Set<String> filters;
    if (hasOption("filter")) {
        filters = Sets.newHashSet(getOptions("filter"));
    } else {
        filters = null;
    }

    boolean useCSV = hasOption("csv");
    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    boolean transposeKeyValue = hasOption("vectorAsKey");

    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
        shouldClose = true;
        log.info("Output file: {}", output);
        Files.createParentDirs(output);
        writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        boolean printKey = hasOption("printKey");
        if (useCSV && dictionary != null) {
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
                writer.write(dictionary[j]);
                if (j < dictionary.length - 1) {
                    writer.write(',');
                }
            }
            writer.write('\n');
        }
        Long numItems = null;
        if (hasOption("numItems")) {
            numItems = Long.parseLong(getOption("numItems"));
            if (quiet) {
                writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
            }
        }
        int maxIndexesPerVector = hasOption("vectorSize") ? Integer.parseInt(getOption("vectorSize"))
                : Integer.MAX_VALUE;
        long itemCount = 0;
        int fileCount = 0;
        for (Path path : pathArr) {
            if (numItems != null && numItems <= itemCount) {
                break;
            }
            if (quiet) {
                log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
            }
            SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<Writable, Writable>(
                    path, true, conf);
            Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
            long i = 0;
            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
                Pair<Writable, Writable> record = iterator.next();
                Writable keyWritable = record.getFirst();
                Writable valueWritable = record.getSecond();
                if (printKey) {
                    Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
                    writer.write(notTheVectorWritable.toString());
                    writer.write('\t');
                }
                Vector vector;
                try {
                    vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
                } catch (ClassCastException e) {
                    if ((transposeKeyValue ? keyWritable
                            : valueWritable) instanceof WeightedPropertyVectorWritable) {
                        vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable
                                : valueWritable)).getVector();
                    } else {
                        throw e;
                    }
                }
                if (filters != null && vector instanceof NamedVector
                        && !filters.contains(((NamedVector) vector).getName())) {
                    //we are filtering out this item, skip
                    continue;
                }
                if (sizeOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write(":");
                    } else {
                        writer.write(String.valueOf(i++));
                        writer.write(":");
                    }
                    writer.write(String.valueOf(vector.size()));
                    writer.write('\n');
                } else if (nameOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write('\n');
                    }
                } else {
                    String fmtStr;
                    if (useCSV) {
                        fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
                    } else {
                        fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, sortVectors);
                    }
                    writer.write(fmtStr);
                    writer.write('\n');
                }
                itemCount++;
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }
    return 0;
}
From source file:org.apache.nifi.processors.hadoop.DeleteHDFS.java
License:Apache License
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile originalFlowFile = session.get();

    // If this processor has an incoming connection, then do not run unless a
    // FlowFile is actually sent through
    if (originalFlowFile == null && context.hasIncomingConnection()) {
        context.yield();
        return;
    }

    // We need a FlowFile to report provenance correctly.
    FlowFile flowFile = originalFlowFile != null ? originalFlowFile : session.create();

    final String fileOrDirectoryName = context.getProperty(FILE_OR_DIRECTORY)
            .evaluateAttributeExpressions(flowFile).getValue();

    final FileSystem fileSystem = getFileSystem();
    try {
        // Check if the user has supplied a file or directory pattern
        List<Path> pathList = Lists.newArrayList();
        if (GLOB_MATCHER.reset(fileOrDirectoryName).find()) {
            FileStatus[] fileStatuses = fileSystem.globStatus(new Path(fileOrDirectoryName));
            if (fileStatuses != null) {
                for (FileStatus fileStatus : fileStatuses) {
                    pathList.add(fileStatus.getPath());
                }
            }
        } else {
            pathList.add(new Path(fileOrDirectoryName));
        }

        int failedPath = 0;
        for (Path path : pathList) {
            if (fileSystem.exists(path)) {
                try {
                    Map<String, String> attributes = Maps.newHashMapWithExpectedSize(2);
                    attributes.put("hdfs.filename", path.getName());
                    attributes.put("hdfs.path", path.getParent().toString());
                    flowFile = session.putAllAttributes(flowFile, attributes);

                    fileSystem.delete(path, context.getProperty(RECURSIVE).asBoolean());
                    getLogger().debug("For flowfile {} Deleted file at path {} with name {}",
                            new Object[] { originalFlowFile, path.getParent().toString(), path.getName() });

                    final Path qualifiedPath = path.makeQualified(fileSystem.getUri(),
                            fileSystem.getWorkingDirectory());
                    session.getProvenanceReporter().invokeRemoteProcess(flowFile, qualifiedPath.toString());
                } catch (IOException ioe) {
                    // One possible scenario is that the IOException is permissions based, however it would be
                    // impractical to check every possible external HDFS authorization tool (Ranger, Sentry, etc).
                    // Local ACLs could be checked but the operation would be expensive.
                    getLogger().warn("Failed to delete file or directory", ioe);

                    Map<String, String> attributes = Maps.newHashMapWithExpectedSize(1);
                    // The error message is helpful in understanding at a flowfile level what caused the
                    // IOException (which ACL is denying the operation, e.g.)
                    attributes.put("hdfs.error.message", ioe.getMessage());

                    session.transfer(session.putAllAttributes(session.clone(flowFile), attributes), REL_FAILURE);
                    failedPath++;
                }
            }
        }

        if (failedPath == 0) {
            session.transfer(flowFile, DeleteHDFS.REL_SUCCESS);
        } else {
            // If any path has been failed to be deleted, remove the FlowFile as it's been cloned and sent to failure.
            session.remove(flowFile);
        }
    } catch (IOException e) {
        getLogger().error("Error processing delete for flowfile {} due to {}",
                new Object[] { flowFile, e.getMessage() }, e);
        session.transfer(flowFile, DeleteHDFS.REL_FAILURE);
    }
}
From source file:org.apache.oozie.action.hadoop.FsActionExecutor.java
License:Apache License
void chgrp(Context context, XConfiguration fsConf, Path nameNodePath, Path path, String user, String group,
        boolean dirFiles, boolean recursive) throws ActionExecutorException {
    HashMap<String, String> argsMap = new HashMap<String, String>();
    argsMap.put("user", user);
    argsMap.put("group", group);
    try {
        FileSystem fs = getFileSystemFor(path, context, fsConf);
        path = resolveToFullPath(nameNodePath, path, true);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
        if (pathArr == null || pathArr.length == 0) {
            throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS009",
                    "chgrp" + ", path(s) that matches [{0}] does not exist", path);
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            recursiveFsOperation("chgrp", fs, nameNodePath, p, argsMap, dirFiles, recursive, true);
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}
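Several of the Oozie examples in this listing (chgrp, delete, move, chmod, FSLauncherURIHandler) share one idiom: feed the globStatus result through FileUtil.stat2Paths to get plain Path objects, then treat a null or empty array as "nothing matched". A stripped-down sketch of that idiom follows; the class name, helper name, and path pattern are illustrative only, and the null pass-through behaviour of stat2Paths is assumed from the Hadoop versions these projects target.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class GlobExpansionIdiom {
    // Hypothetical helper mirroring the Oozie pattern: expand a glob, then treat
    // a null or empty result as "nothing matched".
    static Path[] expandGlob(FileSystem fs, Path pattern) throws IOException {
        // FileUtil.stat2Paths maps each FileStatus to its Path; a null FileStatus[]
        // (non-glob path that does not exist) is assumed to pass through as null.
        Path[] matched = FileUtil.stat2Paths(fs.globStatus(pattern));
        if (matched == null || matched.length == 0) {
            throw new IOException("No path matches pattern " + pattern);
        }
        return matched;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path pattern = new Path("/user/oozie/output/part-*"); // hypothetical path
        FileSystem fs = pattern.getFileSystem(conf);
        for (Path p : expandGlob(fs, pattern)) {
            System.out.println(p);
        }
    }
}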
From source file:org.apache.oozie.action.hadoop.FsActionExecutor.java
License:Apache License
/**
 * Delete path
 *
 * @param context
 * @param fsConf
 * @param nameNodePath
 * @param path
 * @throws ActionExecutorException
 */
public void delete(Context context, XConfiguration fsConf, Path nameNodePath, Path path, boolean skipTrash)
        throws ActionExecutorException {
    URI uri = path.toUri();
    URIHandler handler;
    try {
        handler = Services.get().get(URIHandlerService.class).getURIHandler(uri);
        if (handler instanceof FSURIHandler) {
            // Use legacy code to handle hdfs partition deletion
            path = resolveToFullPath(nameNodePath, path, true);
            final FileSystem fs = getFileSystemFor(path, context, fsConf);
            Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
            if (pathArr != null && pathArr.length > 0) {
                checkGlobMax(pathArr);
                for (final Path p : pathArr) {
                    if (fs.exists(p)) {
                        if (!skipTrash) {
                            // Moving directory/file to trash of user.
                            UserGroupInformationService ugiService = Services.get()
                                    .get(UserGroupInformationService.class);
                            UserGroupInformation ugi = ugiService
                                    .getProxyUser(fs.getConf().get(OozieClient.USER_NAME));
                            ugi.doAs(new PrivilegedExceptionAction<FileSystem>() {
                                @Override
                                public FileSystem run() throws Exception {
                                    Trash trash = new Trash(fs.getConf());
                                    if (!trash.moveToTrash(p)) {
                                        throw new ActionExecutorException(
                                                ActionExecutorException.ErrorType.ERROR, "FS005",
                                                "Could not move path [{0}] to trash on delete", p);
                                    }
                                    return null;
                                }
                            });
                        } else if (!fs.delete(p, true)) {
                            throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS005",
                                    "delete, path [{0}] could not delete path", p);
                        }
                    }
                }
            }
        } else {
            handler.delete(uri, handler.getContext(uri, fsConf, context.getWorkflow().getUser(), false));
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}
From source file:org.apache.oozie.action.hadoop.FsActionExecutor.java
License:Apache License
/**
 * Move source to target
 *
 * @param context
 * @param fsConf
 * @param nameNodePath
 * @param source
 * @param target
 * @param recovery
 * @throws ActionExecutorException
 */
public void move(Context context, XConfiguration fsConf, Path nameNodePath, Path source, Path target,
        boolean recovery) throws ActionExecutorException {
    try {
        source = resolveToFullPath(nameNodePath, source, true);
        validateSameNN(source, target);
        FileSystem fs = getFileSystemFor(source, context, fsConf);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(source));
        if ((pathArr == null || pathArr.length == 0)) {
            if (!recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS006",
                        "move, source path [{0}] does not exist", source);
            } else {
                return;
            }
        }
        if (pathArr.length > 1 && (!fs.exists(target) || fs.isFile(target))) {
            if (!recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS012",
                        "move, could not rename multiple sources to the same target name");
            } else {
                return;
            }
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            if (!fs.rename(p, target) && !recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS008",
                        "move, could not move [{0}] to [{1}]", p, target);
            }
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}
From source file:org.apache.oozie.action.hadoop.FsActionExecutor.java
License:Apache License
void chmod(Context context, XConfiguration fsConf, Path nameNodePath, Path path, String permissions,
        boolean dirFiles, boolean recursive) throws ActionExecutorException {
    HashMap<String, String> argsMap = new HashMap<String, String>();
    argsMap.put("permissions", permissions);
    try {
        FileSystem fs = getFileSystemFor(path, context, fsConf);
        path = resolveToFullPath(nameNodePath, path, true);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
        if (pathArr == null || pathArr.length == 0) {
            throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS009",
                    "chmod" + ", path(s) that matches [{0}] does not exist", path);
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            recursiveFsOperation("chmod", fs, nameNodePath, p, argsMap, dirFiles, recursive, true);
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}
From source file:org.apache.oozie.action.hadoop.FSLauncherURIHandler.java
License:Apache License
@Override
public boolean delete(URI uri, Configuration conf) throws LauncherException {
    boolean status = false;
    try {
        FileSystem fs = FileSystem.get(uri, conf);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(getNormalizedPath(uri)));
        if (pathArr != null && pathArr.length > 0) {
            int fsGlobMax = conf.getInt(LauncherMapper.CONF_OOZIE_ACTION_FS_GLOB_MAX, 1000);
            if (pathArr.length > fsGlobMax) {
                throw new LauncherException(
                        "exceeds max number (" + fsGlobMax + ") of files/dirs to delete in <prepare>");
            }
            for (Path path : pathArr) {
                if (fs.exists(path)) {
                    status = fs.delete(path, true);
                    if (status) {
                        System.out.println("Deletion of path " + path + " succeeded.");
                    } else {
                        System.out.println("Deletion of path " + path + " failed.");
                    }
                }
            }
        }
    } catch (IOException e) {
        throw new LauncherException("Deletion of path " + uri + " failed.", e);
    }
    return status;
}
From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.InputSizeReducerEstimator.java
License:Apache License
/**
 * Get the input size for as many inputs as possible. Inputs that do not report
 * their size nor can pig look that up itself are excluded from this size.
 */
static long getTotalInputFileSize(Configuration conf, List<POLoad> lds, Job job) throws IOException {
    long totalInputFileSize = 0;
    for (POLoad ld : lds) {
        long size = getInputSizeFromLoader(ld, job);
        if (size > -1) {
            totalInputFileSize += size;
            continue;
        } else {
            // the input file location might be a list of comma separated files,
            // separate them out
            for (String location : LoadFunc.getPathStrings(ld.getLFile().getFileName())) {
                if (UriUtil.isHDFSFileOrLocalOrS3N(location, conf)) {
                    Path path = new Path(location);
                    FileSystem fs = path.getFileSystem(conf);
                    FileStatus[] status = fs.globStatus(path);
                    if (status != null) {
                        for (FileStatus s : status) {
                            totalInputFileSize += MapRedUtil.getPathLength(fs, s);
                        }
                    }
                } else {
                    // If we cannot estimate size of a location, we should report -1
                    return -1;
                }
            }
        }
    }
    return totalInputFileSize;
}
From source file:org.apache.pig.builtin.AvroStorage.java
License:Apache License
/**
 * Reads the avro schemas at the specified location.
 * @param p Location of file
 * @param job Hadoop job object
 * @return an Avro Schema object derived from the specified file
 * @throws IOException
 */
public Schema getAvroSchema(final Path[] p, final Job job) throws IOException {
    GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
    ArrayList<FileStatus> statusList = new ArrayList<FileStatus>();
    FileSystem fs = FileSystem.get(p[0].toUri(), job.getConfiguration());
    for (Path temp : p) {
        for (FileStatus tempf : fs.globStatus(temp)) {
            statusList.add(tempf);
        }
    }
    FileStatus[] statusArray = (FileStatus[]) statusList.toArray(new FileStatus[statusList.size()]);

    if (statusArray == null) {
        throw new IOException("Path " + p.toString() + " does not exist.");
    }

    if (statusArray.length == 0) {
        throw new IOException("No path matches pattern " + p.toString());
    }

    Path filePath = Utils.depthFirstSearchForFile(statusArray, fs);

    if (filePath == null) {
        throw new IOException("No path matches pattern " + p.toString());
    }

    InputStream hdfsInputStream = fs.open(filePath);
    DataFileStream<Object> avroDataStream = new DataFileStream<Object>(hdfsInputStream, avroReader);
    Schema s = avroDataStream.getSchema();
    avroDataStream.close();

    return s;
}