Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Returns all the files that match the given path pattern and are not checksum files.
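
Before the project examples below, here is a minimal sketch of calling globStatus directly. The glob pattern "/data/output/part-*" is a placeholder chosen for illustration, and the null/empty-array checks mirror the error handling used in the examples that follow.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical glob: all "part-*" files under /data/output
        Path pattern = new Path("/data/output/part-*");
        FileSystem fs = pattern.getFileSystem(conf);

        // globStatus returns null when a non-glob path does not exist and an
        // empty array when a glob matches nothing, so both cases are checked.
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches == null || matches.length == 0) {
            System.out.println("No files match " + pattern);
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + "\t" + status.getLen());
        }
    }
}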

Usage

From source file:org.godhuli.rhipe.FileUtils.java

License:Apache License

public void copyMain(String src, String dest) throws IOException {
    File dst = new File(dest);
    Path srcpath = new Path(src);
    FileSystem srcFS = FileSystem.get(cfg);
    FileStatus[] srcs = srcFS.globStatus(srcpath);
    boolean dstIsDir = dst.isDirectory();
    if (srcs.length > 1 && !dstIsDir) {
        throw new IOException("When copying multiple files, " + "destination should be a directory.");
    }
    for (FileStatus status : srcs) {
        Path p = status.getPath();
        File f = dstIsDir ? new File(dst, p.getName()) : dst;
        copyToLocal(srcFS, p, f);
    }
}

From source file:org.godhuli.rhipe.FileUtils.java

License:Apache License

private void ls__(String path, ArrayList<String> lsco, boolean dorecurse)
        throws IOException, FileNotFoundException, URISyntaxException {

    Path spath = new Path(path);
    FileSystem srcFS = spath.getFileSystem(getConf());
    FileStatus[] srcs = srcFS.globStatus(spath);
    if (srcs == null || srcs.length == 0) {
        throw new FileNotFoundException("Cannot access " + path + ": No such file or directory.");
    }
    if (srcs.length == 1 && srcs[0].isDir())
        srcs = srcFS.listStatus(srcs[0].getPath());
    Calendar c = Calendar.getInstance();
    for (FileStatus status : srcs) {
        StringBuilder sb = new StringBuilder();
        boolean idir = status.isDir();
        String x = idir ? "d" : "-";
        if (dorecurse && idir)
            ls__(status.getPath().toUri().getPath(), lsco, dorecurse);
        else {
            sb.append(x);
            sb.append(status.getPermission().toString());
            sb.append(fsep);

            sb.append(status.getOwner());
            sb.append(fsep);

            sb.append(status.getGroup());
            sb.append(fsep);

            sb.append(status.getLen());
            sb.append(fsep);

            Date d = new Date(status.getModificationTime());
            sb.append(formatter.format(d));
            sb.append(fsep);

            sb.append(status.getPath().toUri().getPath());
            lsco.add(sb.toString());
        }
    }
}

From source file:org.godhuli.rhipe.RHMapFileOutputFormat.java

License:Apache License

/** Open the output generated by this format. */
public static MapFile.Reader[] getReaders(Path dir, Configuration conf)
        throws IOException, FileNotFoundException {
    FileSystem fs = FileSystem.get(conf);
    // Path[] names = FileUtil.stat2Paths(fs.listStatus(dir));
    FileStatus[] srcs = fs.globStatus(dir);
    if (srcs == null || srcs.length == 0) {
        throw new FileNotFoundException("Cannot access " + dir + ": No such file or directory.");
    }
    // if(srcs.length==1 && srcs[0].isDir())
    //     srcs = fs.listStatus(srcs[0].getPath());

    Path[] names = new Path[srcs.length];
    for (int i = 0; i < names.length; i++) {
        names[i] = srcs[i].getPath();
    }
    // sort names, so that hash partitioning works
    Arrays.sort(names);

    MapFile.Reader[] parts = new MapFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
    }
    return parts;
}

From source file:org.kiji.mapreduce.kvstore.FileKeyValueArrayStore.java

License:Apache License

/** {@inheritDoc} */
@Override
public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
    setConf(conf.getDelegate());
    mDCachePrefix = conf.get(CONF_DCACHE_PREFIX, "");
    LOG.debug("Input dCachePrefix: " + mDCachePrefix);
    mMaxValues = conf.getLong(CONF_MAX_VALUES, Long.MAX_VALUE);

    if (mDCachePrefix.isEmpty()) {
        // Read an ordinary list of files from the Configuration.
        // These may include directories and globs to expand.
        mInputPaths = Lists.map(Arrays.asList(conf.getStrings(CONF_PATHS, new String[0])),
                new Lists.Func<String, Path>() {
                    @Override
                    public Path eval(String in) {
                        LOG.debug("File input: " + in);
                        return new Path(in);
                    }
                });
    } else {
        // Use the dcache prefix to get the names of the files for this store.
        // The symlinks are already present in the working dir of the task.
        final FileSystem localFs = FileSystem.getLocal(conf.getDelegate());
        FileStatus[] statuses = localFs.globStatus(new Path(mDCachePrefix + "-*"));
        if (null == statuses || statuses.length == 0) {
            throw new IOException("No files associated with the job in the DistributedCache");
        }

        // Get the (absolute) input file paths to use.
        mInputPaths = Lists.map(Arrays.asList(statuses), new Lists.Func<FileStatus, Path>() {
            @Override
            public Path eval(FileStatus status) {
                Path out = status.getPath().makeQualified(localFs);
                LOG.debug("Loaded from DistributedCache: " + out);
                return out;
            }
        });
    }
}

From source file:org.kiji.mapreduce.kvstore.lib.FileStoreHelper.java

License:Apache License

/**
 * Deserializes file- and DistributedCache-specific properties associated
 * with the KeyValueStore that owns this FileStoreHelper from the specified configuration.
 *
 * <p>This retains a reference to the KeyValueStoreConfiguration's backing Configuration
 * instance to use when opening files specified by this configuration.</p>
 *
 * @param conf the configuration to read.
 * @throws IOException if there's an error deserializing the configuration.
 */
public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
    setConf(conf.getDelegate());
    mDCachePrefix = conf.get(CONF_DCACHE_PREFIX_KEY, "");
    LOG.debug("Input dCachePrefix: " + mDCachePrefix);
    if (mDCachePrefix.isEmpty()) {
        // Read an ordinary list of files from the Configuration.
        // These may include directories and globs to expand.
        mInputPaths = Lists.map(Arrays.asList(conf.getStrings(CONF_PATHS_KEY, new String[0])),
                new Lists.Func<String, Path>() {
                    @Override
                    public Path eval(String in) {
                        LOG.debug("File input: " + in);
                        return new Path(in);
                    }
                });
    } else {
        // Use the dcache prefix to get the names of the files for this store.
        // The symlinks are already present in the working dir of the task.
        final FileSystem localFs = FileSystem.getLocal(conf.getDelegate());
        FileStatus[] statuses = localFs.globStatus(new Path(mDCachePrefix + "-*"));
        if (null == statuses || statuses.length == 0) {
            throw new IOException("No files associated with the job in the DistributedCache");
        }

        // Get the (absolute) input file paths to use.
        mInputPaths = Lists.map(Arrays.asList(statuses), new Lists.Func<FileStatus, Path>() {
            @Override
            public Path eval(FileStatus status) {
                Path out = status.getPath().makeQualified(localFs);
                LOG.debug("Loaded from DistributedCache: " + out);
                return out;
            }
        });
    }

    // If we are initializing a client-side instance to later serialize, the user may have
    // specified HDFS files, but also an intent to put the files in the DistributedCache. Set
    // this flag now, which will generate mDCachePrefix when addToConfiguration() is called
    // later.
    mUseDCache = conf.getBoolean(CONF_USE_DCACHE_KEY, USE_DCACHE_DEFAULT);
}

From source file:org.mitre.mapred.fs.FileUtils.java

License:Open Source License

/**
 * Get a listing of all files that match the file pattern <i>srcf</i>.
 * <P>Example: "part-*" should return all the parts in lex order</P>
 *
 * @param srcf a file pattern specifying source files
 * @throws IOException
 * @see org.apache.hadoop.fs.FileSystem#globStatus(Path)
 * @see org.apache.hadoop.fs.FsShell
 */
public static synchronized Path[] ls(JobConf conf, String srcF) throws IOException {
    Path srcPath = new Path(srcF);
    FileSystem srcFs = srcPath.getFileSystem(conf);
    FileStatus[] srcs = srcFs.globStatus(srcPath);
    if (srcs == null || srcs.length == 0) {
        throw new FileNotFoundException("Cannot access " + srcPath.toString() + ": No such file or directory.");
    }

    Path[] srcP = new Path[srcs.length];
    for (int i = 0; i < srcs.length; i++) {
        FileStatus stat = srcs[i];
        srcP[i] = stat.getPath();
    }
    return srcP;
}

From source file:org.mrgeo.format.AutoFeatureInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Path[] paths = FileInputFormat.getInputPaths(context);

    List<InputSplit> result = new LinkedList<InputSplit>();
    Configuration conf = context.getConfiguration();

    // expand the wild cards and add up the total size by recursing.
    Vector<Path> expanded = new Vector<Path>();
    long totalSize = 0;
    for (Path p : paths) {
        FileSystem fs = p.getFileSystem(conf);
        FileStatus[] status = fs.globStatus(p);
        for (FileStatus s : status) {
            totalSize += HadoopFileUtils.getPathSize(conf, s.getPath());
            if (s.getPath() != null) {
                expanded.add(s.getPath());
            }
        }
    }
    paths = expanded.toArray(paths);

    // create the individual splits
    String inputs = conf.get("mapred.input.dir", "");
    int totalSplits = conf.getInt("mapred.map.tasks", 2);

    for (Path p : paths) {
        // cast to double to avoid integer division truncating the ratio to zero
        double portion = (double) HadoopFileUtils.getPathSize(conf, p) / totalSize;
        int splits = (int) Math.max(1, Math.round(portion * totalSplits));
        conf.setInt("mapred.map.tasks", splits);
        conf.set("mapred.input.dir", p.toString());
        InputFormat<LongWritable, Geometry> f = FeatureInputFormatFactory.getInstance()
                .createInputFormat(p.toString());
        for (InputSplit s : f.getSplits(context)) {
            AutoInputSplit ais = new AutoInputSplit(conf, s, f);
            result.add(ais);
        }
    }
    conf.setInt("mapred.map.tasks", totalSplits);
    conf.set("mapred.input.dir", inputs);

    return result;
}

From source file:org.mrgeo.format.CsvLineInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    List<InputSplit> splits = new LinkedList<InputSplit>();
    Path[] paths = FileInputFormat.getInputPaths(context);
    Configuration conf = context.getConfiguration();

    long recordCtr = 0;
    for (Path path : paths) {
        FileSystem fs = path.getFileSystem(context.getConfiguration());
        FileStatus[] status = fs.globStatus(path);
        for (FileStatus s : status) {
            Path fileName = s.getPath();
            LineReader lr = null;
            InputStream in = null;
            try {
                in = HadoopFileUtils.open(conf, fileName); // fs.open(fileName);
                lr = new LineReader(in, conf);
                Text line = new Text();
                long begin = 0;
                long length = 0;
                int num = -1;
                while ((num = lr.readLine(line)) > 0) {
                    recordCtr++;
                    length += num;
                    if (recordCtr == recordsPerSplit) {
                        splits.add(new FileSplit(fileName, begin, length, new String[] {}));
                        begin += length;
                        length = 0;
                        recordCtr = 0;
                    }
                }
                //file size smaller than min split size or the last chunk of records was smaller than
                //the split size
                if (length != 0) {
                    splits.add(new FileSplit(fileName, begin, length, new String[] {}));
                }
            } finally {
                if (lr != null) {
                    lr.close();
                }
                if (in != null) {
                    in.close();
                }
            }
        }
    }

    return splits;
}

From source file:org.mrgeo.format.CsvLineInputFormat.java

License:Apache License

@Override
public long getRecordCount(JobContext context) throws IOException {
    Path[] paths = FileInputFormat.getInputPaths(context);
    Configuration conf = context.getConfiguration();

    //get the total number of records in all files
    long recordCount = 0;
    for (Path p : paths) {
        FileSystem fs = p.getFileSystem(context.getConfiguration());
        FileStatus[] status = fs.globStatus(p);
        for (FileStatus s : status) {
            Path fileName = s.getPath();
            if (s.isDir()) {
                throw new IOException("Not a file: " + fileName);
            }
            LineReader lr = null;
            InputStream in = null;
            try {
                in = HadoopFileUtils.open(conf, fileName); // fs.open(fileName);
                lr = new LineReader(in, conf);
                Text line = new Text();
                while ((lr.readLine(line)) > 0) {
                    recordCount++;
                }
            } finally {
                if (lr != null) {
                    lr.close();
                }
                if (in != null) {
                    in.close();
                }
            }
        }
    }
    log.debug("recordCount = " + String.valueOf(recordCount));
    return recordCount;
}

From source file:org.mrgeo.test.MapOpTestVectorUtils.java

License:Apache License

public List readVectorOutputAsText(final Configuration conf, final Path vectorPath) throws IOException {
    // read in the output file
    final FileSystem fs = HadoopFileUtils.getFileSystem(conf, vectorPath);

    ArrayList results = new ArrayList();
    if (fs.isFile(vectorPath)) {
        final FSDataInputStream fdis = fs.open(vectorPath);
        final BufferedReader br = new BufferedReader(new InputStreamReader(fdis));
        try {
            String line = br.readLine();
            while (line != null) {
                results.add(line);
                line = br.readLine();
            }
        } finally {
            br.close();
            if (fdis != null) {
                fdis.close();
            }
        }
    } else {
        Path srcVector = new Path(vectorPath, "part*");
        FileStatus[] files = fs.globStatus(srcVector);
        for (FileStatus fileStat : files) {
            final FSDataInputStream fdis = fs.open(fileStat.getPath());
            final BufferedReader br = new BufferedReader(new InputStreamReader(fdis));
            try {
                String line = br.readLine();
                while (line != null) {
                    results.add(line);
                    line = br.readLine();
                }
            } finally {
                br.close();
                if (fdis != null) {
                    fdis.close();
                }
            }
        }
    }
    return results;
}