List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Returns all files that match pathPattern and are not checksum files, sorted by name. The result is null when a non-glob path does not exist, and an empty array when a glob pattern matches nothing.
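A minimal, self-contained sketch of typical usage (the "/data/part-*" pattern is an illustrative assumption; because globStatus can return null, guard the result before iterating):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical pattern; globStatus returns null when a non-glob
        // path does not exist, so check before iterating.
        FileStatus[] matches = fs.globStatus(new Path("/data/part-*"));
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath());
            }
        }
    }
}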
From source file:org.godhuli.rhipe.FileUtils.java
License:Apache License
public void copyMain(String src, String dest) throws IOException {
    File dst = new File(dest);
    Path srcpath = new Path(src);
    FileSystem srcFS = FileSystem.get(cfg);
    FileStatus[] srcs = srcFS.globStatus(srcpath);
    boolean dstIsDir = dst.isDirectory();
    if (srcs.length > 1 && !dstIsDir) {
        throw new IOException("When copying multiple files, destination should be a directory.");
    }
    for (FileStatus status : srcs) {
        Path p = status.getPath();
        File f = dstIsDir ? new File(dst, p.getName()) : dst;
        copyToLocal(srcFS, p, f);
    }
}
From source file:org.godhuli.rhipe.FileUtils.java
License:Apache License
private void ls__(String path, ArrayList<String> lsco, boolean dorecurse)
        throws IOException, FileNotFoundException, URISyntaxException {
    Path spath = new Path(path);
    FileSystem srcFS = spath.getFileSystem(getConf());
    FileStatus[] srcs = srcFS.globStatus(spath);
    if (srcs == null || srcs.length == 0) {
        throw new FileNotFoundException("Cannot access " + path + ": No such file or directory.");
    }
    // A single directory match is expanded to its children.
    if (srcs.length == 1 && srcs[0].isDir()) {
        srcs = srcFS.listStatus(srcs[0].getPath());
    }
    for (FileStatus status : srcs) {
        StringBuilder sb = new StringBuilder();
        boolean idir = status.isDir();
        String x = idir ? "d" : "-";
        if (dorecurse && idir) {
            ls__(status.getPath().toUri().getPath(), lsco, dorecurse);
        } else {
            // Emit one ls-style record: type flag and permissions, owner, group,
            // length, modification time, and path, separated by fsep.
            sb.append(x);
            sb.append(status.getPermission().toString());
            sb.append(fsep);
            sb.append(status.getOwner());
            sb.append(fsep);
            sb.append(status.getGroup());
            sb.append(fsep);
            sb.append(status.getLen());
            sb.append(fsep);
            Date d = new Date(status.getModificationTime());
            sb.append(formatter.format(d));
            sb.append(fsep);
            sb.append(status.getPath().toUri().getPath());
            lsco.add(sb.toString());
        }
    }
}
From source file:org.godhuli.rhipe.RHMapFileOutputFormat.java
License:Apache License
/** Open the output generated by this format. */
public static MapFile.Reader[] getReaders(Path dir, Configuration conf)
        throws IOException, FileNotFoundException {
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] srcs = fs.globStatus(dir);
    if (srcs == null || srcs.length == 0) {
        throw new FileNotFoundException("Cannot access " + dir + ": No such file or directory.");
    }
    Path[] names = new Path[srcs.length];
    for (int i = 0; i < names.length; i++) {
        names[i] = srcs[i].getPath();
    }
    // Sort names so that hash partitioning works.
    Arrays.sort(names);
    MapFile.Reader[] parts = new MapFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
    }
    return parts;
}
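A hypothetical call site for the helper above; the output directory is an assumed example, and each reader should be closed when finished:

Configuration conf = new Configuration();
// "/user/output/mapfiles" is an assumed directory of MapFile outputs.
MapFile.Reader[] readers = RHMapFileOutputFormat.getReaders(new Path("/user/output/mapfiles"), conf);
try {
    // ... look up keys across the sorted, hash-partitioned readers ...
} finally {
    for (MapFile.Reader reader : readers) {
        reader.close();
    }
}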
From source file:org.kiji.mapreduce.kvstore.FileKeyValueArrayStore.java
License:Apache License
/** {@inheritDoc} */
@Override
public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
    setConf(conf.getDelegate());
    mDCachePrefix = conf.get(CONF_DCACHE_PREFIX, "");
    LOG.debug("Input dCachePrefix: " + mDCachePrefix);
    mMaxValues = conf.getLong(CONF_MAX_VALUES, Long.MAX_VALUE);

    if (mDCachePrefix.isEmpty()) {
        // Read an ordinary list of files from the Configuration.
        // These may include directories and globs to expand.
        mInputPaths = Lists.map(Arrays.asList(conf.getStrings(CONF_PATHS, new String[0])),
                new Lists.Func<String, Path>() {
                    @Override
                    public Path eval(String in) {
                        LOG.debug("File input: " + in);
                        return new Path(in);
                    }
                });
    } else {
        // Use the dcache prefix to get the names of the files for this store.
        // The symlinks are already present in the working dir of the task.
        final FileSystem localFs = FileSystem.getLocal(conf.getDelegate());
        FileStatus[] statuses = localFs.globStatus(new Path(mDCachePrefix + "-*"));
        if (null == statuses || statuses.length == 0) {
            throw new IOException("No files associated with the job in the DistributedCache");
        }

        // Get the (absolute) input file paths to use.
        mInputPaths = Lists.map(Arrays.asList(statuses), new Lists.Func<FileStatus, Path>() {
            @Override
            public Path eval(FileStatus status) {
                Path out = status.getPath().makeQualified(localFs);
                LOG.debug("Loaded from DistributedCache: " + out);
                return out;
            }
        });
    }
}
From source file:org.kiji.mapreduce.kvstore.lib.FileStoreHelper.java
License:Apache License
/**
 * Deserializes file- and DistributedCache-specific properties associated
 * with the KeyValueStore that owns this FileStoreHelper from the specified configuration.
 *
 * <p>This retains a reference to the KeyValueStoreConfiguration's backing Configuration
 * instance to use when opening files specified by this configuration.</p>
 *
 * @param conf the configuration to read.
 * @throws IOException if there's an error deserializing the configuration.
 */
public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
    setConf(conf.getDelegate());
    mDCachePrefix = conf.get(CONF_DCACHE_PREFIX_KEY, "");
    LOG.debug("Input dCachePrefix: " + mDCachePrefix);

    if (mDCachePrefix.isEmpty()) {
        // Read an ordinary list of files from the Configuration.
        // These may include directories and globs to expand.
        mInputPaths = Lists.map(Arrays.asList(conf.getStrings(CONF_PATHS_KEY, new String[0])),
                new Lists.Func<String, Path>() {
                    @Override
                    public Path eval(String in) {
                        LOG.debug("File input: " + in);
                        return new Path(in);
                    }
                });
    } else {
        // Use the dcache prefix to get the names of the files for this store.
        // The symlinks are already present in the working dir of the task.
        final FileSystem localFs = FileSystem.getLocal(conf.getDelegate());
        FileStatus[] statuses = localFs.globStatus(new Path(mDCachePrefix + "-*"));
        if (null == statuses || statuses.length == 0) {
            throw new IOException("No files associated with the job in the DistributedCache");
        }

        // Get the (absolute) input file paths to use.
        mInputPaths = Lists.map(Arrays.asList(statuses), new Lists.Func<FileStatus, Path>() {
            @Override
            public Path eval(FileStatus status) {
                Path out = status.getPath().makeQualified(localFs);
                LOG.debug("Loaded from DistributedCache: " + out);
                return out;
            }
        });
    }

    // If we are initializing a client-side instance to later serialize, the user may have
    // specified HDFS files, but also an intent to put the files in the DistributedCache. Set
    // this flag now, which will generate mDCachePrefix when addToConfiguration() is called
    // later.
    mUseDCache = conf.getBoolean(CONF_USE_DCACHE_KEY, USE_DCACHE_DEFAULT);
}
From source file:org.mitre.mapred.fs.FileUtils.java
License:Open Source License
/**
 * Get a listing of all files that match the file pattern <i>srcF</i>.
 * <p>Example: "part-*" should return all the parts in lexicographic order.</p>
 *
 * @param srcF a file pattern specifying source files
 * @throws IOException
 * @see org.apache.hadoop.fs.FileSystem#globStatus(Path)
 * @see org.apache.hadoop.fs.FsShell
 */
public static synchronized Path[] ls(JobConf conf, String srcF) throws IOException {
    Path srcPath = new Path(srcF);
    FileSystem srcFs = srcPath.getFileSystem(conf);
    FileStatus[] srcs = srcFs.globStatus(srcPath);
    if (srcs == null || srcs.length == 0) {
        throw new FileNotFoundException("Cannot access " + srcPath.toString() + ": No such file or directory.");
    }
    Path[] srcP = new Path[srcs.length];
    for (int i = 0; i < srcs.length; i++) {
        srcP[i] = srcs[i].getPath();
    }
    return srcP;
}
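A hypothetical invocation of this helper, assuming a JobConf is at hand and a job has written part files to an example output directory:

JobConf conf = new JobConf();
// "/user/output/part-*" is an assumed glob over a job's part files.
Path[] parts = FileUtils.ls(conf, "/user/output/part-*");
for (Path part : parts) {
    System.out.println(part);
}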
From source file:org.mrgeo.format.AutoFeatureInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Path[] paths = FileInputFormat.getInputPaths(context);
    List<InputSplit> result = new LinkedList<InputSplit>();
    Configuration conf = context.getConfiguration();

    // Expand the wild cards and add up the total size.
    Vector<Path> expanded = new Vector<Path>();
    long totalSize = 0;
    for (Path p : paths) {
        FileSystem fs = p.getFileSystem(conf);
        FileStatus[] status = fs.globStatus(p);
        for (FileStatus s : status) {
            totalSize += HadoopFileUtils.getPathSize(conf, s.getPath());
            if (s.getPath() != null) {
                expanded.add(s.getPath());
            }
        }
    }
    // Size the array to the expanded list so stale entries from the
    // original array cannot leak in when fewer paths matched.
    paths = expanded.toArray(new Path[expanded.size()]);

    // Create the individual splits, giving each input a share of the map
    // tasks proportional to its share of the total input bytes.
    String inputs = conf.get("mapred.input.dir", "");
    int totalSplits = conf.getInt("mapred.map.tasks", 2);
    for (Path p : paths) {
        // Cast to double to avoid integer division truncating the ratio.
        double portion = (double) HadoopFileUtils.getPathSize(conf, p) / totalSize;
        int splits = (int) Math.max(1, Math.round(portion * totalSplits));
        conf.setInt("mapred.map.tasks", splits);
        conf.set("mapred.input.dir", p.toString());
        InputFormat<LongWritable, Geometry> f = FeatureInputFormatFactory.getInstance()
                .createInputFormat(p.toString());
        for (InputSplit s : f.getSplits(context)) {
            AutoInputSplit ais = new AutoInputSplit(conf, s, f);
            result.add(ais);
        }
    }
    // Restore the original job settings.
    conf.setInt("mapred.map.tasks", totalSplits);
    conf.set("mapred.input.dir", inputs);
    return result;
}
From source file:org.mrgeo.format.CsvLineInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    List<InputSplit> splits = new LinkedList<InputSplit>();
    Path[] paths = FileInputFormat.getInputPaths(context);
    Configuration conf = context.getConfiguration();
    long recordCtr = 0;
    for (Path path : paths) {
        FileSystem fs = path.getFileSystem(context.getConfiguration());
        FileStatus[] status = fs.globStatus(path);
        for (FileStatus s : status) {
            Path fileName = s.getPath();
            LineReader lr = null;
            InputStream in = null;
            try {
                in = HadoopFileUtils.open(conf, fileName); // fs.open(fileName);
                lr = new LineReader(in, conf);
                Text line = new Text();
                long begin = 0;
                long length = 0;
                int num = -1;
                while ((num = lr.readLine(line)) > 0) {
                    recordCtr++;
                    length += num;
                    if (recordCtr == recordsPerSplit) {
                        splits.add(new FileSplit(fileName, begin, length, new String[] {}));
                        begin += length;
                        length = 0;
                        recordCtr = 0;
                    }
                }
                // File size smaller than the min split size, or the last chunk
                // of records was smaller than the split size.
                if (length != 0) {
                    splits.add(new FileSplit(fileName, begin, length, new String[] {}));
                }
            } finally {
                if (lr != null) {
                    lr.close();
                }
                if (in != null) {
                    in.close();
                }
            }
        }
    }
    return splits;
}
From source file:org.mrgeo.format.CsvLineInputFormat.java
License:Apache License
@Override
public long getRecordCount(JobContext context) throws IOException {
    Path[] paths = FileInputFormat.getInputPaths(context);
    Configuration conf = context.getConfiguration();

    // Get the total number of records in all files.
    long recordCount = 0;
    for (Path p : paths) {
        FileSystem fs = p.getFileSystem(context.getConfiguration());
        FileStatus[] status = fs.globStatus(p);
        for (FileStatus s : status) {
            Path fileName = s.getPath();
            if (s.isDir()) {
                throw new IOException("Not a file: " + fileName);
            }
            LineReader lr = null;
            InputStream in = null;
            try {
                in = HadoopFileUtils.open(conf, fileName); // fs.open(fileName);
                lr = new LineReader(in, conf);
                Text line = new Text();
                while ((lr.readLine(line)) > 0) {
                    recordCount++;
                }
            } finally {
                if (lr != null) {
                    lr.close();
                }
                if (in != null) {
                    in.close();
                }
            }
        }
    }
    llog.debug("recordCount = " + String.valueOf(recordCount));
    return recordCount;
}
From source file:org.mrgeo.test.MapOpTestVectorUtils.java
License:Apache License
public List readVectorOutputAsText(final Configuration conf, final Path vectorPath) throws IOException {
    // Read in the output file.
    final FileSystem fs = HadoopFileUtils.getFileSystem(conf, vectorPath);
    ArrayList results = new ArrayList();
    if (fs.isFile(vectorPath)) {
        final FSDataInputStream fdis = fs.open(vectorPath);
        final BufferedReader br = new BufferedReader(new InputStreamReader(fdis));
        try {
            String line = br.readLine();
            while (line != null) {
                results.add(line);
                line = br.readLine();
            }
        } finally {
            br.close();
            if (fdis != null) {
                fdis.close();
            }
        }
    } else {
        // Glob over the part files under the output directory.
        Path srcVector = new Path(vectorPath, "part*");
        FileStatus[] files = fs.globStatus(srcVector);
        for (FileStatus fileStat : files) {
            final FSDataInputStream fdis = fs.open(fileStat.getPath());
            final BufferedReader br = new BufferedReader(new InputStreamReader(fdis));
            try {
                String line = br.readLine();
                while (line != null) {
                    results.add(line);
                    line = br.readLine();
                }
            } finally {
                br.close();
                if (fdis != null) {
                    fdis.close();
                }
            }
        }
    }
    return results;
}