List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Returns all the files that match pathPattern and are not checksum files.
From source file:com.constellio.sdk.tests.FactoriesTestFeatures.java
/**
 * Recursively deletes every entry matched by the relative glob {@code "*"} on the
 * Hadoop file system identified by {@code url}, acting as {@code user}.
 *
 * @param user value used for the {@code HADOOP_USER_NAME} system property and the
 *             {@code hadoop.job.ugi} configuration key
 * @param url  value used for the {@code fs.defaultFS} configuration key
 * @throws RuntimeException with message "No config" when either argument is null,
 *                          or wrapping any {@link IOException} raised by the file system
 */
private void deleteFromHadoop(String user, String url) {
    // Validate first: System.setProperty rejects a null value with a NullPointerException,
    // which would otherwise pre-empt the intended "No config" error.
    if (url == null || user == null) {
        throw new RuntimeException("No config");
    }
    System.setProperty("HADOOP_USER_NAME", user);

    Configuration hadoopConfig = new Configuration();
    hadoopConfig.set("fs.defaultFS", url);
    hadoopConfig.set("hadoop.job.ugi", user);

    try {
        FileSystem hdfs = FileSystem.get(hadoopConfig);
        for (FileStatus file : hdfs.globStatus(new Path("*"))) {
            // 'true' requests recursive deletion of directories.
            hdfs.delete(file.getPath(), true);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:com.conversantmedia.mapreduce.example.PrepareInputsExample.java
License:Apache License
@DriverInit public void copyFilesToWorking() throws IOException { // Copy the input files into the 'workingDir' FileSystem fs = FileSystem.get(getConf()); this.workingDirectory = new Path("/tmp/" + UUID.randomUUID().toString()); fs.mkdirs(workingDirectory);//from w ww . j av a 2 s . c om FileStatus[] files = fs.globStatus(new Path(context.getInput())); for (FileStatus file : files) { Path dest = new Path(workingDirectory, file.getPath().getName()); FileUtil.copy(fs, file.getPath(), fs, dest, false, getConf()); } }
From source file:com.conversantmedia.mapreduce.tool.BaseTool.java
License:Apache License
protected List<FileStatus> getInputFiles(Path input) throws IOException { FileSystem fs = FileSystem.get(getConf()); List<FileStatus> status = new ArrayList<>(); if (fs.exists(input)) { FileStatus inputStatus = fs.getFileStatus(input); if (inputStatus.isDirectory()) { // Move all files under this directory status = Arrays.asList(fs.listStatus(input)); } else {/*from w w w. j a v a2 s.co m*/ status.add(inputStatus); } } // Must be a glob path else { FileStatus[] statusAry = fs.globStatus(input); status.addAll(Arrays.asList(statusAry)); } return status; }
From source file:com.datasalt.pangool.utils.HadoopUtils.java
License:Apache License
/** * Reads maps of integer -> double from glob paths like "folder/part-r*" *///w ww .j a v a 2s. c o m public static HashMap<Integer, Double> readIntDoubleMapFromGlob(Path glob, FileSystem fs) throws IOException { FileStatus status[] = fs.globStatus(glob); HashMap<Integer, Double> ret = new HashMap<Integer, Double>(); for (FileStatus fileS : status) { ret.putAll(readIntDoubleMap(fileS.getPath(), fs)); } return ret; }
From source file:com.datasalt.pangool.utils.HadoopUtils.java
License:Apache License
/**
 * Reads maps of integer -> integer from glob paths like "folder/part-r*" and merges
 * them into a single map. Entries from later files overwrite earlier ones on key collision.
 *
 * @param glob path or glob pattern selecting the files to read
 * @param fs   file system used to expand the glob and read each file
 * @return the merged map; empty when the glob matches nothing
 * @throws IOException if expanding the glob or reading a file fails
 */
public static HashMap<Integer, Integer> readIntIntMapFromGlob(Path glob, FileSystem fs) throws IOException {
    HashMap<Integer, Integer> ret = new HashMap<Integer, Integer>();
    FileStatus[] status = fs.globStatus(glob);
    if (status == null) {
        // globStatus returns null for a non-glob path that does not exist;
        // treat it the same as a glob that matched no files.
        return ret;
    }
    for (FileStatus fileS : status) {
        ret.putAll(readIntIntMap(fileS.getPath(), fs));
    }
    return ret;
}
From source file:com.ebay.erl.mobius.core.builder.AbstractDatasetBuilder.java
License:Apache License
/** * Add the <code>paths</code> to the underline dataset. A boolean * flag <code>validatePathExistance</code> to specify if Mobius * needs to verify the specified <code>paths</code> exist or not. * <p>/*from w ww. j av a 2s.c om*/ * * If <code>validatePathExistance</code> is true, and one of the * <code>paths</code> doesn't exist, <code>IOException</code> will * be thrown. * <p> * * If a path exists and it's a folder, {@link #checkTouchFile(FileSystem, Path)} * will be called to see if a touch file exists under that folder or not. * The default implementation of <code>checkTouchFile</code> always return * true, which means the dataset builder doesn't check touch file by default. * If this is a need to check touch file, the subclass should override that * function, and when the funciton return false, <code>IOException</code> * will be thrown here for that specific path. */ protected ACTUAL_BUILDER_IMPL addInputPath(boolean validatePathExistance, Path... paths) throws IOException { if (paths == null || paths.length == 0) { throw new IllegalArgumentException("Please specify at least one path"); } FileSystem fs = FileSystem.get(this.mobiusJob.getConf()); for (Path aPath : paths) { FileStatus[] fileStatus = null; try { fileStatus = fs.globStatus(aPath); } catch (NullPointerException e) { LOGGER.warn("FileSystem list globStatus thrown NPE", e); } if (fileStatus == null) { if (validatePathExistance) { throw new FileNotFoundException(aPath.toString() + " doesn't exist on file system."); } else { // no need to validate, as the input // for this dataset is coming from // the output of the other dataset. 
this.getDataset().addInputs(aPath); } } else { // file(s) exists, add inputs for (FileStatus aFileStatus : fileStatus) { Path p = aFileStatus.getPath(); if (!fs.isFile(p)) { if (!this.checkTouchFile(fs, p)) { throw new IllegalStateException( "No touch file under " + p.toString() + ", this dataset is not ready."); } else { this.getDataset().addInputs(p); } } else { this.getDataset().addInputs(p); } } } } return (ACTUAL_BUILDER_IMPL) this; }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
/**
 * Scans every sequence file matched by {@code dictionaryPath} and derives the number
 * of terms as one more than the largest {@link IntWritable} value found.
 *
 * @param conf           configuration used to obtain the file system and open readers
 * @param dictionaryPath path or glob pattern of the dictionary sequence files
 * @return the largest value read plus one; 0 when no records are read
 * @throws IOException if a file cannot be opened or read
 */
private static int getNumTerms(Configuration conf, Path dictionaryPath) throws IOException {
    FileSystem fs = dictionaryPath.getFileSystem(conf);
    Text key = new Text();
    IntWritable value = new IntWritable();
    int maxTermId = -1;
    for (FileStatus stat : fs.globStatus(dictionaryPath)) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, stat.getPath(), conf);
        try {
            while (reader.next(key, value)) {
                maxTermId = Math.max(maxTermId, value.get());
            }
        } finally {
            // Release the underlying stream even when reading fails part-way.
            reader.close();
        }
    }
    return maxTermId + 1;
}
From source file:com.flipkart.fdp.migration.distcp.utils.FileCountDriver.java
License:Apache License
public List<String> getAllFilePath(Path filePath, FileSystem fs, String destBasePath) throws IOException { List<String> fileList = new ArrayList<String>(); List<String> inputPaths = new ArrayList<String>(); FileStatus[] fileStatus = fs.globStatus(filePath); for (FileStatus fileStat : fileStatus) { if (fileStat.isFile()) { fileList.add(trimExtension(fileStat.getPath().toUri().getPath(), destBasePath)); } else {// w w w . j ava2 s .com //System.out.println("Found a directory : " + fileStat.getPath().toUri().getPath()); inputPaths.add(fileStat.getPath().toUri().getPath()); } } System.out.println("InputPaths size : " + inputPaths.size()); if (inputPaths.size() > 0) { for (String path : inputPaths) { List<String> fstat = getFileStatusRecursive(new Path(path), fs, destBasePath); fileList.addAll(fstat); } } return fileList; }
From source file:com.google.cloud.dataflow.sdk.io.hdfs.HDFSFileSource.java
License:Apache License
/**
 * When validation is enabled, verifies that {@code filepattern} matches at least
 * one file on the file system it points to.
 *
 * @throws IllegalStateException presumably raised by {@code checkState} when nothing
 *                               matches — confirm against the imported helper
 * @throws RuntimeException      wrapping an {@link IOException} or
 *                               {@link URISyntaxException} from resolving the pattern
 */
@Override
public void validate() {
    if (!validate) {
        return;
    }

    try {
        URI patternUri = new URI(filepattern);
        FileSystem fs = FileSystem.get(patternUri, Job.getInstance().getConfiguration());

        FileStatus[] matches = fs.globStatus(new Path(filepattern));
        boolean anyMatch = matches != null && matches.length > 0;
        checkState(anyMatch, "Unable to find any files matching %s", filepattern);
    } catch (IOException | URISyntaxException e) {
        throw new RuntimeException(e);
    }
}
From source file:com.grantingersoll.intell.clustering.KMeansClusteringEngine.java
License:Apache License
private static Map<Integer, Cluster> loadClusters(ClusterJob job) throws Exception { Map<Integer, Cluster> result = new HashMap<Integer, Cluster>(); try {//ww w .j ava2 s .c o m FileSystem fs = job.output.getFileSystem(job.conf); for (FileStatus seqFile : fs.globStatus(new Path(job.output, "part-*"))) { Path path = seqFile.getPath(); //System.out.println("Input Path: " + path); doesn't this interfere with output? SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job.conf); try { Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance(); Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance(); while (reader.next(key, value)) { Cluster cluster = (Cluster) value; result.put(cluster.getId(), cluster); } } finally { reader.close(); } } } finally { } return result; }