List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match pathPattern and are not checksum files.
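Before the project examples below, here is a minimal, self-contained sketch of a typical globStatus call; the input path and glob pattern are hypothetical. globStatus returns null when the pattern contains no glob characters and the path does not exist, and an empty array when a glob matches nothing, which is why the examples below guard against both cases.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path pattern = new Path("/data/logs/2021-*/part-*"); // hypothetical glob pattern
        FileSystem fs = pattern.getFileSystem(conf);

        FileStatus[] matches = fs.globStatus(pattern);
        // globStatus may return null (no glob characters and the path is missing)
        // or an empty array (the glob matched nothing), so guard against both.
        if (matches == null || matches.length == 0) {
            System.out.println("No files matched " + pattern);
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + "\t" + status.getLen() + " bytes");
        }
    }
}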
From source file:io.warp10.continuum.store.HFileStats.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    CacheConfig cacheConf = new CacheConfig(conf);
    FileSystem fs = FileSystem.newInstance(conf);

    FileStatus[] pathes = fs.globStatus(new Path(args[0]));

    long bytes = 0L;
    long cells = 0L;

    for (FileStatus status : pathes) {
        try {
            HFile.Reader reader = HFile.createReader(fs, status.getPath(), cacheConf, conf);
            bytes += reader.length();
            cells += reader.getEntries();
            System.out.println(
                    status.getPath() + " >>> " + reader.length() + " bytes " + reader.getEntries() + " cells");
            reader.close();
        } catch (Exception e) {
            continue;
        }
    }

    System.out.println(
            "TOTAL: " + cells + " cells " + bytes + " bytes " + (bytes / (double) cells) + " bytes/cell");

    long ts = System.currentTimeMillis();

    System.out.println(ts * 1000 + "// hbase.bytes{} " + bytes);
    System.out.println(ts * 1000 + "// hbase.datapoints{} " + cells);
}
From source file:it.crs4.seal.common.SealToolParser.java
License:Open Source License
/**
 * Parses the command line.
 *
 * Override this method to implement additional command line options,
 * but do make sure you call this method to parse the default options.
 */
protected CommandLine parseOptions(Configuration conf, String[] args) throws ParseException, IOException {
    myconf = conf;
    setDefaultProperties(conf);

    // load settings from configuration file
    // first, parse the command line (in getRcFile) looking for an option overriding the default seal configuration file
    File configFile = getRcFile(args);
    if (configFile != null)
        loadConfig(conf, configFile);

    // now parse the entire command line using the default hadoop parser. Now
    // the user can override properties specified in the config file with properties
    // specified on the command line.
    CommandLine line = new GenericOptionsParser(conf, options, args).getCommandLine();
    if (line == null)
        throw new ParseException("Error parsing command line"); // getCommandLine returns null if there was a parsing error

    ////////////////////// input/output formats //////////////////////
    // set the configuration property. Then, we'll check the property
    // to ensure it has a valid value, regardless of whether we just set it,
    // so that the check will also be valid if the property is set directly.
    if (line.hasOption(opt_inputFormat.getOpt()))
        myconf.set(INPUT_FORMAT_CONF, line.getOptionValue(opt_inputFormat.getOpt()));
    validateIOFormat(INPUT_FORMAT_CONF, acceptedInputFormats);

    if (line.hasOption(opt_outputFormat.getOpt()))
        myconf.set(OUTPUT_FORMAT_CONF, line.getOptionValue(opt_outputFormat.getOpt()));
    validateIOFormat(OUTPUT_FORMAT_CONF, acceptedOutputFormats);

    if (conf.get(INPUT_FORMAT_ENCODING) != null) {
        String value = conf.get(INPUT_FORMAT_ENCODING);
        if (value.equals("sanger") || value.equals("illumina"))
            conf.set(fi.tkk.ics.hadoop.bam.FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING, value);
        else
            throw new ParseException("Invalid " + INPUT_FORMAT_ENCODING + ". Expected 'sanger' or 'illumina'");
    }

    /////////////////////// output compression /////////////////////
    if (line.hasOption(opt_compressOutput.getOpt())) {
        myconf.setBoolean("mapred.output.compress", true);
        String codec = line.getOptionValue(opt_compressOutput.getOpt());
        if (codec != null) {
            String codecClass = "org.apache.hadoop.io.compress.GzipCodec"; // default
            if ("auto".equalsIgnoreCase(codec) || "gzip".equalsIgnoreCase(codec)) {
                // pass. Already set
            } else if ("bzip2".equalsIgnoreCase(codec))
                codecClass = "org.apache.hadoop.io.compress.BZip2Codec";
            else if ("snappy".equalsIgnoreCase(codec))
                codecClass = "org.apache.hadoop.io.compress.SnappyCodec";
            else {
                throw new ParseException("Unknown codec " + codec
                        + ". Valid values are gzip, bzip2, snappy and auto.\n"
                        + "If you want to use an unsupported codec pass 'auto' and set the property mapred.output.compression.codec directly");
            }
            myconf.set("mapred.output.compression.codec", codecClass);
        }
    }

    ////////////////////// number of reducers //////////////////////
    if (line.hasOption(opt_nReduceTasks.getOpt())) {
        String rString = line.getOptionValue(opt_nReduceTasks.getOpt());
        try {
            int r = Integer.parseInt(rString);
            if (r >= minReduceTasks)
                nReduceTasks = r;
            else
                throw new ParseException("Number of reducers must be greater than or equal to " + minReduceTasks
                        + " (got " + rString + ")");
        } catch (NumberFormatException e) {
            throw new ParseException("Invalid number of reduce tasks '" + rString + "'");
        }
    }

    ////////////////////// positional arguments //////////////////////
    String[] otherArgs = line.getArgs();
    if (otherArgs.length < 2) // require at least two: one input and one output
        throw new ParseException("You must provide input and output paths");
    else {
        FileSystem fs;
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            Path p = new Path(otherArgs[i]);
            fs = p.getFileSystem(conf);
            p = p.makeQualified(fs);
            FileStatus[] files = fs.globStatus(p);
            if (files != null && files.length > 0) {
                for (FileStatus status : files)
                    inputs.add(status.getPath());
            } else
                throw new ParseException("Input path " + p.toString() + " doesn't exist");
        }
        // now the last one, should be the output path
        outputDir = new Path(otherArgs[otherArgs.length - 1]);
        fs = outputDir.getFileSystem(conf);
        outputDir = outputDir.makeQualified(fs);
        if (fs.exists(outputDir))
            throw new ParseException("Output path " + outputDir.toString() + " already exists. Won't overwrite");
    }

    return line;
}
From source file:ml.shifu.shifu.fs.ShifuFileUtils.java
License:Apache License
/**
 * Check whether a file exists, according to its SourceType.
 *
 * @param path - path of source file
 * @param sourceType - local/hdfs
 * @return - true if file exists, or false
 * @throws IOException - if any I/O exception in processing
 */
public static boolean isFileExists(String path, SourceType sourceType) throws IOException {
    FileSystem fs = getFileSystemBySourceType(sourceType);
    FileStatus[] fileStatusArr = fs.globStatus(new Path(path));
    return !(fileStatusArr == null || fileStatusArr.length == 0);
}
From source file:ml.shifu.shifu.fs.ShifuFileUtils.java
License:Apache License
/**
 * Expand the file path, allowing the user to use glob patterns just like when using `hadoop fs`.
 * According to the glob rules, patterns such as "{2,3}" and "*" are allowed.
 *
 * @param rawPath - the raw file path that may contain a glob pattern
 * @param sourceType - file source [local/HDFS]
 * @return - the file path list after expansion
 * @throws IOException - if any I/O exception in processing
 */
public static List<String> expandPath(String rawPath, SourceType sourceType) throws IOException {
    FileSystem fs = getFileSystemBySourceType(sourceType);
    FileStatus[] fsArr = fs.globStatus(new Path(rawPath));

    List<String> filePathList = new ArrayList<String>();
    if (fsArr != null) {
        for (FileStatus fileStatus : fsArr) {
            filePathList.add(fileStatus.getPath().toString());
        }
    }

    return filePathList;
}
From source file:ml.shifu.shifu.util.CommonUtils.java
License:Apache License
/**
 * Find the model files for a given @ModelConfig. This function is a little tricky:
 * if @EvalConfig is specified, it tries to load the models according to the settings in @EvalConfig;
 * if @EvalConfig is null or its ModelsPath is blank, Shifu will try to load models under the `models`
 * directory.
 *
 * @param modelConfig - @ModelConfig, need this, since the model file may exist in HDFS
 * @param evalConfig - @EvalConfig, maybe null
 * @param sourceType - where the file system is
 * @return - @FileStatus array for all found models
 * @throws IOException
 */
public static List<FileStatus> findModels(ModelConfig modelConfig, EvalConfig evalConfig, SourceType sourceType)
        throws IOException {
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    PathFinder pathFinder = new PathFinder(modelConfig);

    // If the algorithm in ModelConfig is NN, we only load NN models;
    // the same applies to SVM and LR.
    String modelSuffix = "." + modelConfig.getAlgorithm().toLowerCase();

    List<FileStatus> fileList = new ArrayList<FileStatus>();
    if (null == evalConfig || StringUtils.isBlank(evalConfig.getModelsPath())) {
        Path path = new Path(pathFinder.getModelsPath(sourceType));
        fileList.addAll(Arrays.asList(fs.listStatus(path, new FileSuffixPathFilter(modelSuffix))));
    } else {
        String modelsPath = evalConfig.getModelsPath();
        FileStatus[] expandedPaths = fs.globStatus(new Path(modelsPath));
        if (ArrayUtils.isNotEmpty(expandedPaths)) {
            for (FileStatus epath : expandedPaths) {
                fileList.addAll(
                        Arrays.asList(fs.listStatus(epath.getPath(), new FileSuffixPathFilter(modelSuffix))));
            }
        }
    }

    return fileList;
}
From source file:ml.shifu.shifu.util.ModelSpecLoaderUtils.java
License:Apache License
/**
 * Find the model files for a given @ModelConfig. This function is a little tricky:
 * if @EvalConfig is specified, it tries to load the models according to the settings in @EvalConfig;
 * if {@link EvalConfig} is null or modelsPath is blank, Shifu will try to load models under the `models`
 * directory.
 *
 * @param modelConfig
 *            - {@link ModelConfig}, need this, since the model file may exist in HDFS
 * @param evalConfig
 *            - {@link EvalConfig}, maybe null
 * @param sourceType
 *            - where the file system is
 * @return - {@link FileStatus} array for all found models
 * @throws IOException
 *             io exception to load files
 */
public static List<FileStatus> findModels(ModelConfig modelConfig, EvalConfig evalConfig, SourceType sourceType)
        throws IOException {
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    PathFinder pathFinder = new PathFinder(modelConfig);

    // If the algorithm in ModelConfig is NN, we only load NN models;
    // the same applies to SVM and LR.
    String modelSuffix = "." + modelConfig.getAlgorithm().toLowerCase();

    List<FileStatus> fileList = new ArrayList<>();
    if (null == evalConfig || StringUtils.isBlank(evalConfig.getModelsPath())) {
        Path path = new Path(pathFinder.getModelsPath(sourceType));
        fileList.addAll(Arrays.asList(fs.listStatus(path, new FileSuffixPathFilter(modelSuffix))));
    } else {
        String modelsPath = evalConfig.getModelsPath();
        FileStatus[] expandedPaths = fs.globStatus(new Path(modelsPath));
        if (ArrayUtils.isNotEmpty(expandedPaths)) {
            for (FileStatus fileStatus : expandedPaths) {
                fileList.addAll(Arrays.asList(fs.listStatus(fileStatus.getPath(), // list all files
                        new FileSuffixPathFilter(modelSuffix))));
            }
        }
    }

    return fileList;
}
From source file:ml.shifu.shifu.util.ModelSpecLoaderUtils.java
License:Apache License
/**
 * Load the generic model config and parse it to a java object.
 * Similar to {@link #findModels(ModelConfig, EvalConfig, RawSourceData.SourceType)}.
 *
 * @param modelConfig
 *            - {@link ModelConfig}, need this, since the model file may exist in HDFS
 * @param evalConfig
 *            - {@link EvalConfig}, maybe null
 * @param sourceType
 *            - {@link SourceType}, HDFS or Local?
 * @return the file status list for generic models
 * @throws IOException
 *             Exception occurred when finding generic models
 */
public static List<FileStatus> findGenericModels(ModelConfig modelConfig, EvalConfig evalConfig,
        RawSourceData.SourceType sourceType) throws IOException {
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    PathFinder pathFinder = new PathFinder(modelConfig);

    // Find generic model config files with suffix .json
    String modelSuffix = ".json";

    List<FileStatus> fileList = new ArrayList<>();
    if (null == evalConfig || StringUtils.isBlank(evalConfig.getModelsPath())) {
        Path path = new Path(pathFinder.getModelsPath(sourceType)); // modelsPath / <ModelName>
        // + File.separator + modelConfig.getBasic().getName());
        fileList.addAll(Arrays.asList(fs.listStatus(path, new FileSuffixPathFilter(modelSuffix))));
    } else {
        String modelsPath = evalConfig.getModelsPath();
        FileStatus[] expandedPaths = fs.globStatus(new Path(modelsPath)); // models / <ModelName>
        // + File.separator + modelConfig.getBasic().getName()));
        if (ArrayUtils.isNotEmpty(expandedPaths)) {
            for (FileStatus epath : expandedPaths) {
                fileList.addAll(Arrays.asList(fs.listStatus(epath.getPath(), // list all files with suffix
                        new FileSuffixPathFilter(modelSuffix))));
            }
        }
    }

    return fileList;
}
From source file:net.myrrix.batch.common.iterator.sequencefile.SequenceFileDirIterator.java
License:Apache License
/**
 * Constructor that uses either {@link FileSystem#listStatus(Path)} or
 * {@link FileSystem#globStatus(Path)} to obtain the list of files to iterate over
 * (depending on the pathType parameter).
 */
public SequenceFileDirIterator(Path path, PathType pathType, PathFilter filter, Comparator<FileStatus> ordering,
        final boolean reuseKeyValueInstances, final Configuration conf) throws IOException {

    FileStatus[] statuses;
    FileSystem fs = path.getFileSystem(conf);
    if (filter == null) {
        statuses = pathType == PathType.GLOB ? fs.globStatus(path) : fs.listStatus(path);
    } else {
        statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : fs.listStatus(path, filter);
    }

    if (statuses == null) {
        statuses = NO_STATUSES;
    } else {
        if (ordering == null) {
            // If order does not matter, use a random order
            Collections.shuffle(Arrays.asList(statuses));
        } else {
            Arrays.sort(statuses, ordering);
        }
    }

    closer = Closer.create();

    Iterator<Iterator<Pair<K, V>>> fsIterators = Iterators.transform(Iterators.forArray(statuses),
            new Function<FileStatus, Iterator<Pair<K, V>>>() {
                @Override
                public Iterator<Pair<K, V>> apply(FileStatus from) {
                    try {
                        SequenceFileIterator<K, V> iterator = new SequenceFileIterator<K, V>(from.getPath(),
                                reuseKeyValueInstances, conf);
                        closer.register(iterator);
                        return iterator;
                    } catch (IOException ioe) {
                        throw new IllegalStateException(from.getPath().toString(), ioe);
                    }
                }
            });

    delegate = Iterators.concat(fsIterators);
}
From source file:net.sf.katta.indexing.IndexerJob.java
License:Apache License
public void startIndexer(String path, String finalDestination, int numOfShards) throws IOException {
    // create job conf with class pointing into job jar.
    JobConf jobConf = new JobConf(IndexerJob.class);
    jobConf.setJobName("indexer");
    jobConf.setMapRunnerClass(Indexer.class);
    // alternatively use a text file and a TextInputFormat
    jobConf.setInputFormat(SequenceFileInputFormat.class);

    Path input = new Path(path);
    FileInputFormat.setInputPaths(jobConf, input);
    // we just set the output path to make hadoop happy.
    FileOutputFormat.setOutputPath(jobConf, new Path(finalDestination));
    // setting the folder where lucene indexes will be copied when finished.
    jobConf.set("finalDestination", finalDestination);
    // important to switch speculative execution off.
    // We don't want to have something duplicated.
    jobConf.setSpeculativeExecution(false);

    // The num of map tasks is equal to the num of input splits.
    // The num of input splits by default is equal to the num of hdfs blocks
    // for the input file(s). To get the right num of shards we need to
    // calculate the best input split size.
    FileSystem fs = FileSystem.get(input.toUri(), jobConf);
    FileStatus[] status = fs.globStatus(input);
    long size = 0;
    for (FileStatus fileStatus : status) {
        size += fileStatus.getLen();
    }
    long optimalSplitSize = size / numOfShards;
    jobConf.set("mapred.min.split.size", "" + optimalSplitSize);

    // give more mem to lucene tasks.
    jobConf.set("mapred.child.java.opts", "-Xmx2G");
    jobConf.setNumMapTasks(1);
    jobConf.setNumReduceTasks(0);
    JobClient.runJob(jobConf);
}
From source file:nl.surfsara.newsreader.loader.ReadNewsreaderDocs.java
License:Apache License
@Override
public Long run() {
    long numfilesread = 0;
    Path sPath = new Path(source);
    File destDir = new File(dest);
    if (destDir.isDirectory()) {
        destDir.mkdirs();
        try {
            FileSystem fileSystem = FileSystem.get(conf);
            FileStatus[] globStatus = fileSystem.globStatus(sPath);
            for (FileStatus fss : globStatus) {
                if (fss.isFile()) {
                    Option optPath = SequenceFile.Reader.file(fss.getPath());
                    SequenceFile.Reader r = new SequenceFile.Reader(conf, optPath);
                    Text key = new Text();
                    Text val = new Text();
                    while (r.next(key, val)) {
                        File outputFile = new File(destDir, key.toString());
                        FileOutputStream fos = new FileOutputStream(outputFile);
                        InputStream is = IOUtils.toInputStream(val.toString());
                        IOUtils.copy(is, fos);
                        fos.flush();
                        fos.close();
                        numfilesread++;
                    }
                    r.close();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    } else {
        System.out.println("Destination should be a directory.");
    }
    return numfilesread;
}