Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
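
Before the full examples below, here is a minimal, self-contained sketch of calling globStatus against the default file system. The class name and the fallback glob pattern are illustrative assumptions, not taken from any of the usage examples that follow. Note that globStatus may return null (for instance, for a literal path with no wildcard that does not exist), so the result is null-checked here just as in the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusSketch {
    public static void main(String[] args) throws IOException {
        // Resolve the default file system from the Hadoop configuration.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // The fallback pattern is an illustrative assumption; pass your own glob as args[0].
        Path pattern = new Path(args.length > 0 ? args[0] : "/data/logs/*.log");

        // Expand the glob. globStatus may return null (e.g. a literal path with
        // no wildcard that does not exist), so the result is null-checked.
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}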

Usage

From source file:io.warp10.continuum.store.HFileStats.java

License:Apache License

public static void main(String[] args) throws Exception {

    Configuration conf = new Configuration();
    CacheConfig cacheConf = new CacheConfig(conf);

    FileSystem fs = FileSystem.newInstance(conf);

    FileStatus[] paths = fs.globStatus(new Path(args[0]));

    long bytes = 0L;
    long cells = 0L;

    for (FileStatus status : paths) {
        try {
            HFile.Reader reader = HFile.createReader(fs, status.getPath(), cacheConf, conf);
            bytes += reader.length();
            cells += reader.getEntries();

            System.out.println(
                    status.getPath() + " >>> " + reader.length() + " bytes " + reader.getEntries() + " cells");

            reader.close();
        } catch (Exception e) {
            continue;
        }
    }

    System.out.println(
            "TOTAL: " + cells + " cells " + bytes + " bytes " + (bytes / (double) cells) + " bytes/cell");

    long ts = System.currentTimeMillis();

    System.out.println(ts * 1000 + "// hbase.bytes{} " + bytes);
    System.out.println(ts * 1000 + "// hbase.datapoints{} " + cells);
}

From source file:it.crs4.seal.common.SealToolParser.java

License:Open Source License

/**
 * Parses the command line.
 *
 * Override this method to implement additional command line options,
 * but do make sure you call this method to parse the default options.
 */
protected CommandLine parseOptions(Configuration conf, String[] args) throws ParseException, IOException {
    myconf = conf;

    setDefaultProperties(conf);

    // load settings from configuration file
    // first, parse the command line (in getRcFile) looking for an option overriding the default seal configuration file
    File configFile = getRcFile(args);
    if (configFile != null)
        loadConfig(conf, configFile);

    // now parse the entire command line using the default hadoop parser.  Now
    // the user can override properties specified in the config file with properties
    // specified on the command line.
    CommandLine line = new GenericOptionsParser(conf, options, args).getCommandLine();
    if (line == null)
        throw new ParseException("Error parsing command line"); // getCommandLine returns null if there was a parsing error

    ////////////////////// input/output formats //////////////////////
    // set the configuration property.  Then, we'll check the property
    // to ensure it has a valid value, regardless of whether we just set it,
    // so that the check will also be valid if the property is set directly.
    if (line.hasOption(opt_inputFormat.getOpt()))
        myconf.set(INPUT_FORMAT_CONF, line.getOptionValue(opt_inputFormat.getOpt()));

    validateIOFormat(INPUT_FORMAT_CONF, acceptedInputFormats);

    if (line.hasOption(opt_outputFormat.getOpt()))
        myconf.set(OUTPUT_FORMAT_CONF, line.getOptionValue(opt_outputFormat.getOpt()));

    validateIOFormat(OUTPUT_FORMAT_CONF, acceptedOutputFormats);

    if (conf.get(INPUT_FORMAT_ENCODING) != null) {
        String value = conf.get(INPUT_FORMAT_ENCODING);
        if (value.equals("sanger") || value.equals("illumina"))
            conf.set(fi.tkk.ics.hadoop.bam.FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING, value);
        else
            throw new ParseException("Invalid " + INPUT_FORMAT_ENCODING + ". Expected 'sanger' or 'illumina'");
    }

    /////////////////////// output compression /////////////////////
    if (line.hasOption(opt_compressOutput.getOpt())) {
        myconf.setBoolean("mapred.output.compress", true);
        String codec = line.getOptionValue(opt_compressOutput.getOpt());
        if (codec != null) {
            String codecClass = "org.apache.hadoop.io.compress.GzipCodec"; // default
            if ("auto".equalsIgnoreCase(codec) || "gzip".equalsIgnoreCase(codec)) {
                // pass.  Already set
            } else if ("bzip2".equalsIgnoreCase(codec))
                codecClass = "org.apache.hadoop.io.compress.BZip2Codec";
            else if ("snappy".equalsIgnoreCase(codec))
                codecClass = "org.apache.hadoop.io.compress.SnappyCodec";
            else {
                throw new ParseException("Unknown codec " + codec
                        + ". Valid values are gzip, bzip2, snappy and auto.\n"
                        + "If you want to use an unsupported codec pass 'auto' and set the property mapred.output.compression.codec directly");
            }

            myconf.set("mapred.output.compression.codec", codecClass);
        }
    }

    ////////////////////// number of reducers //////////////////////
    if (line.hasOption(opt_nReduceTasks.getOpt())) {
        String rString = line.getOptionValue(opt_nReduceTasks.getOpt());
        try {
            int r = Integer.parseInt(rString);
            if (r >= minReduceTasks)
                nReduceTasks = r;
            else
                throw new ParseException("Number of reducers must be greater than or equal to " + minReduceTasks
                        + " (got " + rString + ")");
        } catch (NumberFormatException e) {
            throw new ParseException("Invalid number of reduce tasks '" + rString + "'");
        }
    }

    ////////////////////// positional arguments //////////////////////
    String[] otherArgs = line.getArgs();
    if (otherArgs.length < 2) // require at least two:  one input and one output
        throw new ParseException("You must provide input and output paths");
    else {
        //
        FileSystem fs;
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            Path p = new Path(otherArgs[i]);
            fs = p.getFileSystem(conf);
            p = p.makeQualified(fs);
            FileStatus[] files = fs.globStatus(p);
            if (files != null && files.length > 0) {
                for (FileStatus status : files)
                    inputs.add(status.getPath());
            } else
                throw new ParseException("Input path " + p.toString() + " doesn't exist");
        }
        // now the last one, should be the output path
        outputDir = new Path(otherArgs[otherArgs.length - 1]);
        fs = outputDir.getFileSystem(conf);
        outputDir = outputDir.makeQualified(fs);
        if (fs.exists(outputDir))
            throw new ParseException(
                    "Output path " + outputDir.toString() + " already exists.  Won't overwrite");
    }

    return line;
}

From source file:ml.shifu.shifu.fs.ShifuFileUtils.java

License:Apache License

/**
 * Check whether a file exists, according to its SourceType.
 *
 * @param path       - path of the source file
 * @param sourceType - local/hdfs
 * @return - true if the file exists, false otherwise
 * @throws IOException - if any I/O exception occurs in processing
 */
public static boolean isFileExists(String path, SourceType sourceType) throws IOException {
    FileSystem fs = getFileSystemBySourceType(sourceType);
    FileStatus[] fileStatusArr = fs.globStatus(new Path(path));
    return !(fileStatusArr == null || fileStatusArr.length == 0);
}

From source file:ml.shifu.shifu.fs.ShifuFileUtils.java

License:Apache License

/**
 * Expand the file path, allowing the user to use glob patterns just like with `hadoop fs`.
 * According to the glob rules, patterns such as "{2,3}" and "*" are allowed.
 *
 * @param rawPath    - the raw file path that may contain glob patterns
 * @param sourceType - file source [local/HDFS]
 * @return - the file path list after expansion
 * @throws IOException - if any I/O exception occurs in processing
 */
public static List<String> expandPath(String rawPath, SourceType sourceType) throws IOException {
    FileSystem fs = getFileSystemBySourceType(sourceType);
    FileStatus[] fsArr = fs.globStatus(new Path(rawPath));

    List<String> filePathList = new ArrayList<String>();
    if (fsArr != null) {
        for (FileStatus fileStatus : fsArr) {
            filePathList.add(fileStatus.getPath().toString());
        }
    }

    return filePathList;
}

From source file:ml.shifu.shifu.util.CommonUtils.java

License:Apache License

/**
 * Find the model files for a @ModelConfig. This function is a little tricky:
 * if @EvalConfig is specified, try to load the models according to the settings in @EvalConfig;
 * if @EvalConfig is null or its ModelsPath is blank, Shifu will try to load models under the `models`
 * directory.
 *
 * @param modelConfig - @ModelConfig, needed since the model files may exist in HDFS
 * @param evalConfig  - @EvalConfig, may be null
 * @param sourceType  - which file system the models are on
 * @return - @FileStatus list for all found models
 * @throws IOException - if any I/O exception occurs while finding models
 */
public static List<FileStatus> findModels(ModelConfig modelConfig, EvalConfig evalConfig, SourceType sourceType)
        throws IOException {
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    PathFinder pathFinder = new PathFinder(modelConfig);

    // If the algorithm in ModelConfig is NN, we only load NN models
    // the same as SVM, LR
    String modelSuffix = "." + modelConfig.getAlgorithm().toLowerCase();

    List<FileStatus> fileList = new ArrayList<FileStatus>();
    if (null == evalConfig || StringUtils.isBlank(evalConfig.getModelsPath())) {
        Path path = new Path(pathFinder.getModelsPath(sourceType));
        fileList.addAll(Arrays.asList(fs.listStatus(path, new FileSuffixPathFilter(modelSuffix))));
    } else {
        String modelsPath = evalConfig.getModelsPath();
        FileStatus[] expandedPaths = fs.globStatus(new Path(modelsPath));
        if (ArrayUtils.isNotEmpty(expandedPaths)) {
            for (FileStatus epath : expandedPaths) {
                fileList.addAll(
                        Arrays.asList(fs.listStatus(epath.getPath(), new FileSuffixPathFilter(modelSuffix))));
            }
        }
    }

    return fileList;
}

From source file:ml.shifu.shifu.util.ModelSpecLoaderUtils.java

License:Apache License

/**
 * Find the model files for a @ModelConfig. This function is a little tricky:
 * if @EvalConfig is specified, try to load the models according to the settings in @EvalConfig;
 * if {@link EvalConfig} is null or modelsPath is blank, Shifu will try to load models under the `models`
 * directory.
 *
 * @param modelConfig
 *            - {@link ModelConfig}, need this, since the model file may exist in HDFS
 * @param evalConfig
 *            - {@link EvalConfig}, maybe null
 * @param sourceType
 *            - which file system the models are on
 * @return - {@link FileStatus} array for all found models
 * @throws IOException
 *             io exception to load files
 */
public static List<FileStatus> findModels(ModelConfig modelConfig, EvalConfig evalConfig, SourceType sourceType)
        throws IOException {
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    PathFinder pathFinder = new PathFinder(modelConfig);

    // If the algorithm in ModelConfig is NN, we only load NN models
    // the same as SVM, LR
    String modelSuffix = "." + modelConfig.getAlgorithm().toLowerCase();

    List<FileStatus> fileList = new ArrayList<>();
    if (null == evalConfig || StringUtils.isBlank(evalConfig.getModelsPath())) {
        Path path = new Path(pathFinder.getModelsPath(sourceType));
        fileList.addAll(Arrays.asList(fs.listStatus(path, new FileSuffixPathFilter(modelSuffix))));
    } else {
        String modelsPath = evalConfig.getModelsPath();
        FileStatus[] expandedPaths = fs.globStatus(new Path(modelsPath));
        if (ArrayUtils.isNotEmpty(expandedPaths)) {
            for (FileStatus fileStatus : expandedPaths) {
                fileList.addAll(Arrays.asList(fs.listStatus(fileStatus.getPath(), // list all files
                        new FileSuffixPathFilter(modelSuffix))));
            }
        }
    }

    return fileList;
}

From source file:ml.shifu.shifu.util.ModelSpecLoaderUtils.java

License:Apache License

/**
 * Load the generic model config and parse it into a Java object.
 * Similar to {@link #findModels(ModelConfig, EvalConfig, RawSourceData.SourceType)}
 *
 * @param modelConfig
 *            - {@link ModelConfig}, need this, since the model file may exist in HDFS
 * @param evalConfig
 *            - {@link EvalConfig}, maybe null
 * @param sourceType
 *            - {@link SourceType}, HDFS or Local?
 * @return the file status list for generic models
 * @throws IOException
 *             Exception occurred when finding generic models
 */
public static List<FileStatus> findGenericModels(ModelConfig modelConfig, EvalConfig evalConfig,
        RawSourceData.SourceType sourceType) throws IOException {
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    PathFinder pathFinder = new PathFinder(modelConfig);

    // Find generic model config file with suffix .json
    String modelSuffix = ".json";

    List<FileStatus> fileList = new ArrayList<>();
    if (null == evalConfig || StringUtils.isBlank(evalConfig.getModelsPath())) {
        Path path = new Path(pathFinder.getModelsPath(sourceType)); // modelsPath / <ModelName>
        // + File.separator + modelConfig.getBasic().getName());
        fileList.addAll(Arrays.asList(fs.listStatus(path, new FileSuffixPathFilter(modelSuffix))));
    } else {
        String modelsPath = evalConfig.getModelsPath();
        FileStatus[] expandedPaths = fs.globStatus(new Path(modelsPath)); // models / <ModelName>
        // + File.separator + modelConfig.getBasic().getName()));
        if (ArrayUtils.isNotEmpty(expandedPaths)) {
            for (FileStatus epath : expandedPaths) {
                fileList.addAll(Arrays.asList(fs.listStatus(epath.getPath(), // list all files with suffix
                        new FileSuffixPathFilter(modelSuffix))));
            }
        }
    }

    return fileList;
}

From source file:net.myrrix.batch.common.iterator.sequencefile.SequenceFileDirIterator.java

License:Apache License

/**
 * Constructor that uses either {@link FileSystem#listStatus(Path)} or
 * {@link FileSystem#globStatus(Path)} to obtain list of files to iterate over
 * (depending on the pathType parameter).
 */
public SequenceFileDirIterator(Path path, PathType pathType, PathFilter filter, Comparator<FileStatus> ordering,
        final boolean reuseKeyValueInstances, final Configuration conf) throws IOException {

    FileStatus[] statuses;
    FileSystem fs = path.getFileSystem(conf);
    if (filter == null) {
        statuses = pathType == PathType.GLOB ? fs.globStatus(path) : fs.listStatus(path);
    } else {
        statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : fs.listStatus(path, filter);
    }

    if (statuses == null) {
        statuses = NO_STATUSES;
    } else {
        if (ordering == null) {
            // If order does not matter, use a random order
            Collections.shuffle(Arrays.asList(statuses));
        } else {
            Arrays.sort(statuses, ordering);
        }
    }

    closer = Closer.create();

    Iterator<Iterator<Pair<K, V>>> fsIterators = Iterators.transform(Iterators.forArray(statuses),
            new Function<FileStatus, Iterator<Pair<K, V>>>() {
                @Override
                public Iterator<Pair<K, V>> apply(FileStatus from) {
                    try {
                        SequenceFileIterator<K, V> iterator = new SequenceFileIterator<K, V>(from.getPath(),
                                reuseKeyValueInstances, conf);
                        closer.register(iterator);
                        return iterator;
                    } catch (IOException ioe) {
                        throw new IllegalStateException(from.getPath().toString(), ioe);
                    }
                }
            });

    delegate = Iterators.concat(fsIterators);
}

From source file:net.sf.katta.indexing.IndexerJob.java

License:Apache License

public void startIndexer(String path, String finalDestination, int numOfShards) throws IOException {
    // create job conf with class pointing into job jar.
    JobConf jobConf = new JobConf(IndexerJob.class);
    jobConf.setJobName("indexer");
    jobConf.setMapRunnerClass(Indexer.class);
    // alternatively, use a text file and a TextInputFormat
    jobConf.setInputFormat(SequenceFileInputFormat.class);

    Path input = new Path(path);
    FileInputFormat.setInputPaths(jobConf, input);
    // we just set the output path to make hadoop happy.
    FileOutputFormat.setOutputPath(jobConf, new Path(finalDestination));
    // setting the folder where lucene indexes will be copied when finished.
    jobConf.set("finalDestination", finalDestination);
    // Important: switch speculative execution off.
    // We don't want anything duplicated.
    jobConf.setSpeculativeExecution(false);

    // The number of map tasks is equal to the number of input splits.
    // The number of input splits by default is equal to the number of HDFS blocks
    // for the input file(s). To get the right number of shards we need to
    // calculate the best input split size.

    FileSystem fs = FileSystem.get(input.toUri(), jobConf);
    FileStatus[] status = fs.globStatus(input);
    long size = 0;
    for (FileStatus fileStatus : status) {
        size += fileStatus.getLen();
    }
    long optimalSplitSize = size / numOfShards;
    jobConf.set("mapred.min.split.size", "" + optimalSplitSize);

    // give more mem to lucene tasks.
    jobConf.set("mapred.child.java.opts", "-Xmx2G");
    jobConf.setNumMapTasks(1);
    jobConf.setNumReduceTasks(0);
    JobClient.runJob(jobConf);
}

From source file:nl.surfsara.newsreader.loader.ReadNewsreaderDocs.java

License:Apache License

@Override
public Long run() {
    long numfilesread = 0;
    Path sPath = new Path(source);
    File destDir = new File(dest);
    if (destDir.isDirectory()) {
        destDir.mkdirs();
        try {
            FileSystem fileSystem = FileSystem.get(conf);
            FileStatus[] globStatus = fileSystem.globStatus(sPath);
            for (FileStatus fss : globStatus) {
                if (fss.isFile()) {
                    Option optPath = SequenceFile.Reader.file(fss.getPath());
                    SequenceFile.Reader r = new SequenceFile.Reader(conf, optPath);

                    Text key = new Text();
                    Text val = new Text();

                    while (r.next(key, val)) {
                        File outputFile = new File(destDir, key.toString());
                        FileOutputStream fos = new FileOutputStream(outputFile);
                        InputStream is = IOUtils.toInputStream(val.toString());
                        IOUtils.copy(is, fos);
                        fos.flush();
                        fos.close();
                        numfilesread++;
                    }
                    r.close();
                }

            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    } else {
        System.out.println("Destination should be a directory.");
    }
    return numfilesread;
}