List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match pathPattern and are not checksum files.
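Before the project examples below, here is a minimal, self-contained sketch of a typical globStatus call; the input path and glob pattern are hypothetical. globStatus returns null when the pattern contains no glob characters and the path does not exist, and an empty array when a glob matches nothing, which is why the examples below guard against both cases.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path pattern = new Path("/data/logs/2021-*/part-*"); // hypothetical glob pattern
        FileSystem fs = pattern.getFileSystem(conf);

        FileStatus[] matches = fs.globStatus(pattern);
        // globStatus may return null (no glob characters and the path is missing)
        // or an empty array (the glob matched nothing), so guard against both.
        if (matches == null || matches.length == 0) {
            System.out.println("No files matched " + pattern);
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + "\t" + status.getLen() + " bytes");
        }
    }
}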
From source file:io.warp10.continuum.store.HFileStats.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    CacheConfig cacheConf = new CacheConfig(conf);
    FileSystem fs = FileSystem.newInstance(conf);

    FileStatus[] pathes = fs.globStatus(new Path(args[0]));

    long bytes = 0L;
    long cells = 0L;

    for (FileStatus status : pathes) {
        try {
            HFile.Reader reader = HFile.createReader(fs, status.getPath(), cacheConf, conf);
            bytes += reader.length();
            cells += reader.getEntries();
            System.out.println(
                    status.getPath() + " >>> " + reader.length() + " bytes " + reader.getEntries() + " cells");
            reader.close();
        } catch (Exception e) {
            continue;
        }
    }

    System.out.println(
            "TOTAL: " + cells + " cells " + bytes + " bytes " + (bytes / (double) cells) + " bytes/cell");

    long ts = System.currentTimeMillis();

    System.out.println(ts * 1000 + "// hbase.bytes{} " + bytes);
    System.out.println(ts * 1000 + "// hbase.datapoints{} " + cells);
}
From source file:it.crs4.seal.common.SealToolParser.java
License:Open Source License
/**
 * Parses the command line.
 *
 * Override this method to implement additional command line options,
 * but do make sure you call this method to parse the default options.
 */
protected CommandLine parseOptions(Configuration conf, String[] args) throws ParseException, IOException {
    myconf = conf;
    setDefaultProperties(conf);

    // load settings from configuration file
    // first, parse the command line (in getRcFile) looking for an option overriding the default seal configuration file
    File configFile = getRcFile(args);
    if (configFile != null)
        loadConfig(conf, configFile);

    // now parse the entire command line using the default hadoop parser. Now
    // the user can override properties specified in the config file with properties
    // specified on the command line.
    CommandLine line = new GenericOptionsParser(conf, options, args).getCommandLine();
    if (line == null)
        throw new ParseException("Error parsing command line"); // getCommandLine returns null if there was a parsing error

    ////////////////////// input/output formats //////////////////////
    // set the configuration property. Then, we'll check the property
    // to ensure it has a valid value, regardless of whether we just set it,
    // so that the check will also be valid if the property is set directly.
    if (line.hasOption(opt_inputFormat.getOpt()))
        myconf.set(INPUT_FORMAT_CONF, line.getOptionValue(opt_inputFormat.getOpt()));
    validateIOFormat(INPUT_FORMAT_CONF, acceptedInputFormats);

    if (line.hasOption(opt_outputFormat.getOpt()))
        myconf.set(OUTPUT_FORMAT_CONF, line.getOptionValue(opt_outputFormat.getOpt()));
    validateIOFormat(OUTPUT_FORMAT_CONF, acceptedOutputFormats);

    if (conf.get(INPUT_FORMAT_ENCODING) != null) {
        String value = conf.get(INPUT_FORMAT_ENCODING);
        if (value.equals("sanger") || value.equals("illumina"))
            conf.set(fi.tkk.ics.hadoop.bam.FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING, value);
        else
            throw new ParseException("Invalid " + INPUT_FORMAT_ENCODING + ". Expected 'sanger' or 'illumina'");
    }

    /////////////////////// output compression /////////////////////
    if (line.hasOption(opt_compressOutput.getOpt())) {
        myconf.setBoolean("mapred.output.compress", true);
        String codec = line.getOptionValue(opt_compressOutput.getOpt());
        if (codec != null) {
            String codecClass = "org.apache.hadoop.io.compress.GzipCodec"; // default
            if ("auto".equalsIgnoreCase(codec) || "gzip".equalsIgnoreCase(codec)) {
                // pass. Already set
            } else if ("bzip2".equalsIgnoreCase(codec))
                codecClass = "org.apache.hadoop.io.compress.BZip2Codec";
            else if ("snappy".equalsIgnoreCase(codec))
                codecClass = "org.apache.hadoop.io.compress.SnappyCodec";
            else {
                throw new ParseException("Unknown codec " + codec
                        + ". Valid values are gzip, bzip2, snappy and auto.\n"
                        + "If you want to use an unsupported codec pass 'auto' and set the property mapred.output.compression.codec directly");
            }
            myconf.set("mapred.output.compression.codec", codecClass);
        }
    }

    ////////////////////// number of reducers //////////////////////
    if (line.hasOption(opt_nReduceTasks.getOpt())) {
        String rString = line.getOptionValue(opt_nReduceTasks.getOpt());
        try {
            int r = Integer.parseInt(rString);
            if (r >= minReduceTasks)
                nReduceTasks = r;
            else
                throw new ParseException("Number of reducers must be greater than or equal to " + minReduceTasks
                        + " (got " + rString + ")");
        } catch (NumberFormatException e) {
            throw new ParseException("Invalid number of reduce tasks '" + rString + "'");
        }
    }

    ////////////////////// positional arguments //////////////////////
    String[] otherArgs = line.getArgs();
    if (otherArgs.length < 2) // require at least two: one input and one output
        throw new ParseException("You must provide input and output paths");
    else {
        FileSystem fs;
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            Path p = new Path(otherArgs[i]);
            fs = p.getFileSystem(conf);
            p = p.makeQualified(fs);
            FileStatus[] files = fs.globStatus(p);
            if (files != null && files.length > 0) {
                for (FileStatus status : files)
                    inputs.add(status.getPath());
            } else
                throw new ParseException("Input path " + p.toString() + " doesn't exist");
        }
        // now the last one, should be the output path
        outputDir = new Path(otherArgs[otherArgs.length - 1]);
        fs = outputDir.getFileSystem(conf);
        outputDir = outputDir.makeQualified(fs);
        if (fs.exists(outputDir))
            throw new ParseException("Output path " + outputDir.toString() + " already exists. Won't overwrite");
    }

    return line;
}
From source file:ml.shifu.shifu.fs.ShifuFileUtils.java
License:Apache License
/**
 * Check whether a file exists, according to its SourceType.
 *
 * @param path - path of source file
 * @param sourceType - local/hdfs
 * @return - true if file exists, or false
 * @throws IOException - if any I/O exception in processing
 */
public static boolean isFileExists(String path, SourceType sourceType) throws IOException {
    FileSystem fs = getFileSystemBySourceType(sourceType);
    FileStatus[] fileStatusArr = fs.globStatus(new Path(path));
    return !(fileStatusArr == null || fileStatusArr.length == 0);
}
From source file:ml.shifu.shifu.fs.ShifuFileUtils.java
License:Apache License
/**
 * Expand the file path, allowing the user to use glob patterns just like when using `hadoop fs`.
 * According to the glob rules, patterns such as "{2,3}" and "*" are allowed.
 *
 * @param rawPath - the raw file path that may contain a glob pattern
 * @param sourceType - file source [local/HDFS]
 * @return - the file path list after expansion
 * @throws IOException - if any I/O exception in processing
 */
public static List<String> expandPath(String rawPath, SourceType sourceType) throws IOException {
    FileSystem fs = getFileSystemBySourceType(sourceType);
    FileStatus[] fsArr = fs.globStatus(new Path(rawPath));

    List<String> filePathList = new ArrayList<String>();
    if (fsArr != null) {
        for (FileStatus fileStatus : fsArr) {
            filePathList.add(fileStatus.getPath().toString());
        }
    }

    return filePathList;
}
From source file:ml.shifu.shifu.util.CommonUtils.java
License:Apache License
/**
 * Find the model files for a given @ModelConfig. This function is a little tricky:
 * if @EvalConfig is specified, it tries to load the models according to the settings in @EvalConfig;
 * if @EvalConfig is null or its ModelsPath is blank, Shifu will try to load models under the `models`
 * directory.
 *
 * @param modelConfig - @ModelConfig, need this, since the model file may exist in HDFS
 * @param evalConfig - @EvalConfig, maybe null
 * @param sourceType - where the file system is
 * @return - @FileStatus array for all found models
 * @throws IOException
 */
public static List<FileStatus> findModels(ModelConfig modelConfig, EvalConfig evalConfig, SourceType sourceType)
        throws IOException {
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    PathFinder pathFinder = new PathFinder(modelConfig);

    // If the algorithm in ModelConfig is NN, we only load NN models;
    // the same applies to SVM and LR.
    String modelSuffix = "." + modelConfig.getAlgorithm().toLowerCase();

    List<FileStatus> fileList = new ArrayList<FileStatus>();
    if (null == evalConfig || StringUtils.isBlank(evalConfig.getModelsPath())) {
        Path path = new Path(pathFinder.getModelsPath(sourceType));
        fileList.addAll(Arrays.asList(fs.listStatus(path, new FileSuffixPathFilter(modelSuffix))));
    } else {
        String modelsPath = evalConfig.getModelsPath();
        FileStatus[] expandedPaths = fs.globStatus(new Path(modelsPath));
        if (ArrayUtils.isNotEmpty(expandedPaths)) {
            for (FileStatus epath : expandedPaths) {
                fileList.addAll(
                        Arrays.asList(fs.listStatus(epath.getPath(), new FileSuffixPathFilter(modelSuffix))));
            }
        }
    }

    return fileList;
}
From source file:ml.shifu.shifu.util.ModelSpecLoaderUtils.java
License:Apache License
/**
 * Find the model files for a given @ModelConfig. This function is a little tricky:
 * if @EvalConfig is specified, it tries to load the models according to the settings in @EvalConfig;
 * if {@link EvalConfig} is null or modelsPath is blank, Shifu will try to load models under the `models`
 * directory.
 *
 * @param modelConfig
 *            - {@link ModelConfig}, need this, since the model file may exist in HDFS
 * @param evalConfig
 *            - {@link EvalConfig}, maybe null
 * @param sourceType
 *            - where the file system is
 * @return - {@link FileStatus} array for all found models
 * @throws IOException
 *             io exception to load files
 */
public static List<FileStatus> findModels(ModelConfig modelConfig, EvalConfig evalConfig, SourceType sourceType)
        throws IOException {
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    PathFinder pathFinder = new PathFinder(modelConfig);

    // If the algorithm in ModelConfig is NN, we only load NN models;
    // the same applies to SVM and LR.
    String modelSuffix = "." + modelConfig.getAlgorithm().toLowerCase();

    List<FileStatus> fileList = new ArrayList<>();
    if (null == evalConfig || StringUtils.isBlank(evalConfig.getModelsPath())) {
        Path path = new Path(pathFinder.getModelsPath(sourceType));
        fileList.addAll(Arrays.asList(fs.listStatus(path, new FileSuffixPathFilter(modelSuffix))));
    } else {
        String modelsPath = evalConfig.getModelsPath();
        FileStatus[] expandedPaths = fs.globStatus(new Path(modelsPath));
        if (ArrayUtils.isNotEmpty(expandedPaths)) {
            for (FileStatus fileStatus : expandedPaths) {
                fileList.addAll(Arrays.asList(fs.listStatus(fileStatus.getPath(), // list all files
                        new FileSuffixPathFilter(modelSuffix))));
            }
        }
    }

    return fileList;
}
From source file:ml.shifu.shifu.util.ModelSpecLoaderUtils.java
License:Apache License
/**
 * Load the generic model config and parse it to a java object.
 * Similar to {@link #findModels(ModelConfig, EvalConfig, RawSourceData.SourceType)}.
 *
 * @param modelConfig
 *            - {@link ModelConfig}, need this, since the model file may exist in HDFS
 * @param evalConfig
 *            - {@link EvalConfig}, maybe null
 * @param sourceType
 *            - {@link SourceType}, HDFS or Local?
 * @return the file status list for generic models
 * @throws IOException
 *             Exception occurred when finding generic models
 */
public static List<FileStatus> findGenericModels(ModelConfig modelConfig, EvalConfig evalConfig,
        RawSourceData.SourceType sourceType) throws IOException {
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    PathFinder pathFinder = new PathFinder(modelConfig);

    // Find generic model config files with suffix .json
    String modelSuffix = ".json";

    List<FileStatus> fileList = new ArrayList<>();
    if (null == evalConfig || StringUtils.isBlank(evalConfig.getModelsPath())) {
        Path path = new Path(pathFinder.getModelsPath(sourceType)); // modelsPath / <ModelName>
        // + File.separator + modelConfig.getBasic().getName());
        fileList.addAll(Arrays.asList(fs.listStatus(path, new FileSuffixPathFilter(modelSuffix))));
    } else {
        String modelsPath = evalConfig.getModelsPath();
        FileStatus[] expandedPaths = fs.globStatus(new Path(modelsPath)); // models / <ModelName>
        // + File.separator + modelConfig.getBasic().getName()));
        if (ArrayUtils.isNotEmpty(expandedPaths)) {
            for (FileStatus epath : expandedPaths) {
                fileList.addAll(Arrays.asList(fs.listStatus(epath.getPath(), // list all files with suffix
                        new FileSuffixPathFilter(modelSuffix))));
            }
        }
    }

    return fileList;
}
From source file:net.myrrix.batch.common.iterator.sequencefile.SequenceFileDirIterator.java
License:Apache License
/**
 * Constructor that uses either {@link FileSystem#listStatus(Path)} or
 * {@link FileSystem#globStatus(Path)} to obtain the list of files to iterate over
 * (depending on the pathType parameter).
 */
public SequenceFileDirIterator(Path path, PathType pathType, PathFilter filter, Comparator<FileStatus> ordering,
        final boolean reuseKeyValueInstances, final Configuration conf) throws IOException {

    FileStatus[] statuses;
    FileSystem fs = path.getFileSystem(conf);
    if (filter == null) {
        statuses = pathType == PathType.GLOB ? fs.globStatus(path) : fs.listStatus(path);
    } else {
        statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : fs.listStatus(path, filter);
    }

    if (statuses == null) {
        statuses = NO_STATUSES;
    } else {
        if (ordering == null) {
            // If order does not matter, use a random order
            Collections.shuffle(Arrays.asList(statuses));
        } else {
            Arrays.sort(statuses, ordering);
        }
    }

    closer = Closer.create();

    Iterator<Iterator<Pair<K, V>>> fsIterators = Iterators.transform(Iterators.forArray(statuses),
            new Function<FileStatus, Iterator<Pair<K, V>>>() {
                @Override
                public Iterator<Pair<K, V>> apply(FileStatus from) {
                    try {
                        SequenceFileIterator<K, V> iterator = new SequenceFileIterator<K, V>(from.getPath(),
                                reuseKeyValueInstances, conf);
                        closer.register(iterator);
                        return iterator;
                    } catch (IOException ioe) {
                        throw new IllegalStateException(from.getPath().toString(), ioe);
                    }
                }
            });

    delegate = Iterators.concat(fsIterators);
}
From source file:net.sf.katta.indexing.IndexerJob.java
License:Apache License
public void startIndexer(String path, String finalDestination, int numOfShards) throws IOException {
    // create job conf with class pointing into job jar.
    JobConf jobConf = new JobConf(IndexerJob.class);
    jobConf.setJobName("indexer");
    jobConf.setMapRunnerClass(Indexer.class);
    // alternatively use a text file and a TextInputFormat
    jobConf.setInputFormat(SequenceFileInputFormat.class);

    Path input = new Path(path);
    FileInputFormat.setInputPaths(jobConf, input);
    // we just set the output path to make hadoop happy.
    FileOutputFormat.setOutputPath(jobConf, new Path(finalDestination));
    // setting the folder where lucene indexes will be copied when finished.
    jobConf.set("finalDestination", finalDestination);
    // important to switch speculative execution off.
    // We don't want to have something duplicated.
    jobConf.setSpeculativeExecution(false);

    // The num of map tasks is equal to the num of input splits.
    // The num of input splits by default is equal to the num of hdfs blocks
    // for the input file(s). To get the right num of shards we need to
    // calculate the best input split size.
    FileSystem fs = FileSystem.get(input.toUri(), jobConf);
    FileStatus[] status = fs.globStatus(input);
    long size = 0;
    for (FileStatus fileStatus : status) {
        size += fileStatus.getLen();
    }
    long optimalSplitSize = size / numOfShards;
    jobConf.set("mapred.min.split.size", "" + optimalSplitSize);

    // give more mem to lucene tasks.
    jobConf.set("mapred.child.java.opts", "-Xmx2G");
    jobConf.setNumMapTasks(1);
    jobConf.setNumReduceTasks(0);
    JobClient.runJob(jobConf);
}
From source file:nl.surfsara.newsreader.loader.ReadNewsreaderDocs.java
License:Apache License
@Override
public Long run() {
    long numfilesread = 0;
    Path sPath = new Path(source);
    File destDir = new File(dest);
    if (destDir.isDirectory()) {
        destDir.mkdirs();
        try {
            FileSystem fileSystem = FileSystem.get(conf);
            FileStatus[] globStatus = fileSystem.globStatus(sPath);
            for (FileStatus fss : globStatus) {
                if (fss.isFile()) {
                    Option optPath = SequenceFile.Reader.file(fss.getPath());
                    SequenceFile.Reader r = new SequenceFile.Reader(conf, optPath);
                    Text key = new Text();
                    Text val = new Text();
                    while (r.next(key, val)) {
                        File outputFile = new File(destDir, key.toString());
                        FileOutputStream fos = new FileOutputStream(outputFile);
                        InputStream is = IOUtils.toInputStream(val.toString());
                        IOUtils.copy(is, fos);
                        fos.flush();
                        fos.close();
                        numfilesread++;
                    }
                    r.close();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    } else {
        System.out.println("Destination should be a directory.");
    }
    return numfilesread;
}