List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Returns all files that match pathPattern and are not checksum files, sorted by name. The result is null when a non-glob path does not exist, and an empty array when a glob pattern matches nothing.
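A minimal, self-contained sketch of typical usage (the "/data/part-*" pattern is an illustrative assumption; because globStatus can return null, guard the result before iterating):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical pattern; globStatus returns null when a non-glob
        // path does not exist, so check before iterating.
        FileStatus[] matches = fs.globStatus(new Path("/data/part-*"));
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath());
            }
        }
    }
}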
From source file:org.godhuli.rhipe.FileUtils.java
License:Apache License
public void copyMain(String src, String dest) throws IOException {
    File dst = new File(dest);
    Path srcpath = new Path(src);
    FileSystem srcFS = FileSystem.get(cfg);
    FileStatus[] srcs = srcFS.globStatus(srcpath);
    boolean dstIsDir = dst.isDirectory();
    if (srcs.length > 1 && !dstIsDir) {
        throw new IOException("When copying multiple files, destination should be a directory.");
    }
    for (FileStatus status : srcs) {
        Path p = status.getPath();
        File f = dstIsDir ? new File(dst, p.getName()) : dst;
        copyToLocal(srcFS, p, f);
    }
}
From source file:org.godhuli.rhipe.FileUtils.java
License:Apache License
private void ls__(String path, ArrayList<String> lsco, boolean dorecurse)
        throws IOException, FileNotFoundException, URISyntaxException {
    Path spath = new Path(path);
    FileSystem srcFS = spath.getFileSystem(getConf());
    FileStatus[] srcs = srcFS.globStatus(spath);
    if (srcs == null || srcs.length == 0) {
        throw new FileNotFoundException("Cannot access " + path + ": No such file or directory.");
    }
    // A single directory match is expanded to its children.
    if (srcs.length == 1 && srcs[0].isDir()) {
        srcs = srcFS.listStatus(srcs[0].getPath());
    }
    for (FileStatus status : srcs) {
        StringBuilder sb = new StringBuilder();
        boolean idir = status.isDir();
        String x = idir ? "d" : "-";
        if (dorecurse && idir) {
            ls__(status.getPath().toUri().getPath(), lsco, dorecurse);
        } else {
            // Emit one ls-style record: type flag and permissions, owner, group,
            // length, modification time, and path, separated by fsep.
            sb.append(x);
            sb.append(status.getPermission().toString());
            sb.append(fsep);
            sb.append(status.getOwner());
            sb.append(fsep);
            sb.append(status.getGroup());
            sb.append(fsep);
            sb.append(status.getLen());
            sb.append(fsep);
            Date d = new Date(status.getModificationTime());
            sb.append(formatter.format(d));
            sb.append(fsep);
            sb.append(status.getPath().toUri().getPath());
            lsco.add(sb.toString());
        }
    }
}
From source file:org.godhuli.rhipe.RHMapFileOutputFormat.java
License:Apache License
/** Open the output generated by this format. */
public static MapFile.Reader[] getReaders(Path dir, Configuration conf)
        throws IOException, FileNotFoundException {
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] srcs = fs.globStatus(dir);
    if (srcs == null || srcs.length == 0) {
        throw new FileNotFoundException("Cannot access " + dir + ": No such file or directory.");
    }
    Path[] names = new Path[srcs.length];
    for (int i = 0; i < names.length; i++) {
        names[i] = srcs[i].getPath();
    }
    // Sort names so that hash partitioning works.
    Arrays.sort(names);
    MapFile.Reader[] parts = new MapFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
    }
    return parts;
}
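A hypothetical call site for the helper above; the output directory is an assumed example, and each reader should be closed when finished:

Configuration conf = new Configuration();
// "/user/output/mapfiles" is an assumed directory of MapFile outputs.
MapFile.Reader[] readers = RHMapFileOutputFormat.getReaders(new Path("/user/output/mapfiles"), conf);
try {
    // ... look up keys across the sorted, hash-partitioned readers ...
} finally {
    for (MapFile.Reader reader : readers) {
        reader.close();
    }
}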
From source file:org.kiji.mapreduce.kvstore.FileKeyValueArrayStore.java
License:Apache License
/** {@inheritDoc} */
@Override
public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
    setConf(conf.getDelegate());
    mDCachePrefix = conf.get(CONF_DCACHE_PREFIX, "");
    LOG.debug("Input dCachePrefix: " + mDCachePrefix);
    mMaxValues = conf.getLong(CONF_MAX_VALUES, Long.MAX_VALUE);

    if (mDCachePrefix.isEmpty()) {
        // Read an ordinary list of files from the Configuration.
        // These may include directories and globs to expand.
        mInputPaths = Lists.map(Arrays.asList(conf.getStrings(CONF_PATHS, new String[0])),
                new Lists.Func<String, Path>() {
                    @Override
                    public Path eval(String in) {
                        LOG.debug("File input: " + in);
                        return new Path(in);
                    }
                });
    } else {
        // Use the dcache prefix to get the names of the files for this store.
        // The symlinks are already present in the working dir of the task.
        final FileSystem localFs = FileSystem.getLocal(conf.getDelegate());
        FileStatus[] statuses = localFs.globStatus(new Path(mDCachePrefix + "-*"));
        if (null == statuses || statuses.length == 0) {
            throw new IOException("No files associated with the job in the DistributedCache");
        }

        // Get the (absolute) input file paths to use.
        mInputPaths = Lists.map(Arrays.asList(statuses), new Lists.Func<FileStatus, Path>() {
            @Override
            public Path eval(FileStatus status) {
                Path out = status.getPath().makeQualified(localFs);
                LOG.debug("Loaded from DistributedCache: " + out);
                return out;
            }
        });
    }
}
From source file:org.kiji.mapreduce.kvstore.lib.FileStoreHelper.java
License:Apache License
/**
 * Deserializes file- and DistributedCache-specific properties associated
 * with the KeyValueStore that owns this FileStoreHelper from the specified configuration.
 *
 * <p>This retains a reference to the KeyValueStoreConfiguration's backing Configuration
 * instance to use when opening files specified by this configuration.</p>
 *
 * @param conf the configuration to read.
 * @throws IOException if there's an error deserializing the configuration.
 */
public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
    setConf(conf.getDelegate());
    mDCachePrefix = conf.get(CONF_DCACHE_PREFIX_KEY, "");
    LOG.debug("Input dCachePrefix: " + mDCachePrefix);

    if (mDCachePrefix.isEmpty()) {
        // Read an ordinary list of files from the Configuration.
        // These may include directories and globs to expand.
        mInputPaths = Lists.map(Arrays.asList(conf.getStrings(CONF_PATHS_KEY, new String[0])),
                new Lists.Func<String, Path>() {
                    @Override
                    public Path eval(String in) {
                        LOG.debug("File input: " + in);
                        return new Path(in);
                    }
                });
    } else {
        // Use the dcache prefix to get the names of the files for this store.
        // The symlinks are already present in the working dir of the task.
        final FileSystem localFs = FileSystem.getLocal(conf.getDelegate());
        FileStatus[] statuses = localFs.globStatus(new Path(mDCachePrefix + "-*"));
        if (null == statuses || statuses.length == 0) {
            throw new IOException("No files associated with the job in the DistributedCache");
        }

        // Get the (absolute) input file paths to use.
        mInputPaths = Lists.map(Arrays.asList(statuses), new Lists.Func<FileStatus, Path>() {
            @Override
            public Path eval(FileStatus status) {
                Path out = status.getPath().makeQualified(localFs);
                LOG.debug("Loaded from DistributedCache: " + out);
                return out;
            }
        });
    }

    // If we are initializing a client-side instance to later serialize, the user may have
    // specified HDFS files, but also an intent to put the files in the DistributedCache. Set
    // this flag now, which will generate mDCachePrefix when addToConfiguration() is called
    // later.
    mUseDCache = conf.getBoolean(CONF_USE_DCACHE_KEY, USE_DCACHE_DEFAULT);
}
From source file:org.mitre.mapred.fs.FileUtils.java
License:Open Source License
/**
 * Get a listing of all files that match the file pattern <i>srcF</i>.
 * <p>Example: "part-*" should return all the parts in lexicographic order.</p>
 *
 * @param srcF a file pattern specifying source files
 * @throws IOException
 * @see org.apache.hadoop.fs.FileSystem#globStatus(Path)
 * @see org.apache.hadoop.fs.FsShell
 */
public static synchronized Path[] ls(JobConf conf, String srcF) throws IOException {
    Path srcPath = new Path(srcF);
    FileSystem srcFs = srcPath.getFileSystem(conf);
    FileStatus[] srcs = srcFs.globStatus(srcPath);
    if (srcs == null || srcs.length == 0) {
        throw new FileNotFoundException("Cannot access " + srcPath.toString() + ": No such file or directory.");
    }
    Path[] srcP = new Path[srcs.length];
    for (int i = 0; i < srcs.length; i++) {
        srcP[i] = srcs[i].getPath();
    }
    return srcP;
}
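A hypothetical invocation of this helper, assuming a JobConf is at hand and a job has written part files to an example output directory:

JobConf conf = new JobConf();
// "/user/output/part-*" is an assumed glob over a job's part files.
Path[] parts = FileUtils.ls(conf, "/user/output/part-*");
for (Path part : parts) {
    System.out.println(part);
}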
From source file:org.mrgeo.format.AutoFeatureInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Path[] paths = FileInputFormat.getInputPaths(context);
    List<InputSplit> result = new LinkedList<InputSplit>();
    Configuration conf = context.getConfiguration();

    // Expand the wild cards and add up the total size.
    Vector<Path> expanded = new Vector<Path>();
    long totalSize = 0;
    for (Path p : paths) {
        FileSystem fs = p.getFileSystem(conf);
        FileStatus[] status = fs.globStatus(p);
        for (FileStatus s : status) {
            totalSize += HadoopFileUtils.getPathSize(conf, s.getPath());
            if (s.getPath() != null) {
                expanded.add(s.getPath());
            }
        }
    }
    // Size the array to the expanded list so stale entries from the
    // original array cannot leak in when fewer paths matched.
    paths = expanded.toArray(new Path[expanded.size()]);

    // Create the individual splits, giving each input a share of the map
    // tasks proportional to its share of the total input bytes.
    String inputs = conf.get("mapred.input.dir", "");
    int totalSplits = conf.getInt("mapred.map.tasks", 2);
    for (Path p : paths) {
        // Cast to double to avoid integer division truncating the ratio.
        double portion = (double) HadoopFileUtils.getPathSize(conf, p) / totalSize;
        int splits = (int) Math.max(1, Math.round(portion * totalSplits));
        conf.setInt("mapred.map.tasks", splits);
        conf.set("mapred.input.dir", p.toString());
        InputFormat<LongWritable, Geometry> f = FeatureInputFormatFactory.getInstance()
                .createInputFormat(p.toString());
        for (InputSplit s : f.getSplits(context)) {
            AutoInputSplit ais = new AutoInputSplit(conf, s, f);
            result.add(ais);
        }
    }
    // Restore the original job settings.
    conf.setInt("mapred.map.tasks", totalSplits);
    conf.set("mapred.input.dir", inputs);
    return result;
}
From source file:org.mrgeo.format.CsvLineInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    List<InputSplit> splits = new LinkedList<InputSplit>();
    Path[] paths = FileInputFormat.getInputPaths(context);
    Configuration conf = context.getConfiguration();
    long recordCtr = 0;
    for (Path path : paths) {
        FileSystem fs = path.getFileSystem(context.getConfiguration());
        FileStatus[] status = fs.globStatus(path);
        for (FileStatus s : status) {
            Path fileName = s.getPath();
            LineReader lr = null;
            InputStream in = null;
            try {
                in = HadoopFileUtils.open(conf, fileName); // fs.open(fileName);
                lr = new LineReader(in, conf);
                Text line = new Text();
                long begin = 0;
                long length = 0;
                int num = -1;
                while ((num = lr.readLine(line)) > 0) {
                    recordCtr++;
                    length += num;
                    if (recordCtr == recordsPerSplit) {
                        splits.add(new FileSplit(fileName, begin, length, new String[] {}));
                        begin += length;
                        length = 0;
                        recordCtr = 0;
                    }
                }
                // File size smaller than the min split size, or the last chunk
                // of records was smaller than the split size.
                if (length != 0) {
                    splits.add(new FileSplit(fileName, begin, length, new String[] {}));
                }
            } finally {
                if (lr != null) {
                    lr.close();
                }
                if (in != null) {
                    in.close();
                }
            }
        }
    }
    return splits;
}
From source file:org.mrgeo.format.CsvLineInputFormat.java
License:Apache License
@Override
public long getRecordCount(JobContext context) throws IOException {
    Path[] paths = FileInputFormat.getInputPaths(context);
    Configuration conf = context.getConfiguration();

    // Get the total number of records in all files.
    long recordCount = 0;
    for (Path p : paths) {
        FileSystem fs = p.getFileSystem(context.getConfiguration());
        FileStatus[] status = fs.globStatus(p);
        for (FileStatus s : status) {
            Path fileName = s.getPath();
            if (s.isDir()) {
                throw new IOException("Not a file: " + fileName);
            }
            LineReader lr = null;
            InputStream in = null;
            try {
                in = HadoopFileUtils.open(conf, fileName); // fs.open(fileName);
                lr = new LineReader(in, conf);
                Text line = new Text();
                while ((lr.readLine(line)) > 0) {
                    recordCount++;
                }
            } finally {
                if (lr != null) {
                    lr.close();
                }
                if (in != null) {
                    in.close();
                }
            }
        }
    }
    llog.debug("recordCount = " + String.valueOf(recordCount));
    return recordCount;
}
From source file:org.mrgeo.test.MapOpTestVectorUtils.java
License:Apache License
public List readVectorOutputAsText(final Configuration conf, final Path vectorPath) throws IOException {
    // Read in the output file.
    final FileSystem fs = HadoopFileUtils.getFileSystem(conf, vectorPath);
    ArrayList results = new ArrayList();
    if (fs.isFile(vectorPath)) {
        final FSDataInputStream fdis = fs.open(vectorPath);
        final BufferedReader br = new BufferedReader(new InputStreamReader(fdis));
        try {
            String line = br.readLine();
            while (line != null) {
                results.add(line);
                line = br.readLine();
            }
        } finally {
            br.close();
            if (fdis != null) {
                fdis.close();
            }
        }
    } else {
        // Glob over the part files under the output directory.
        Path srcVector = new Path(vectorPath, "part*");
        FileStatus[] files = fs.globStatus(srcVector);
        for (FileStatus fileStat : files) {
            final FSDataInputStream fdis = fs.open(fileStat.getPath());
            final BufferedReader br = new BufferedReader(new InputStreamReader(fdis));
            try {
                String line = br.readLine();
                while (line != null) {
                    results.add(line);
                    line = br.readLine();
                }
            } finally {
                br.close();
                if (fdis != null) {
                    fdis.close();
                }
            }
        }
    }
    return results;
}