List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match pathPattern and are not checksum files.
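Before the per-project examples, a minimal, self-contained sketch of calling globStatus (the /tmp/logs path pattern is illustrative, not taken from any example below). globStatus returns null when a non-glob path does not exist and an empty array when a glob matches nothing, so callers should guard against both, as most of the examples below do.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path pattern = new Path("/tmp/logs/2024-*/part-*"); // illustrative pattern
        FileSystem fs = pattern.getFileSystem(conf);

        // null: non-glob path that does not exist; empty: glob with no matches
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches == null || matches.length == 0) {
            System.out.println("no files match " + pattern);
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + "\t" + status.getLen() + " bytes");
        }
    }
}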
From source file:acromusashi.stream.bolt.hdfs.HdfsPreProcessor.java
License:Open Source License
/**
 * Preprocessing for the HDFS sink.<br>
 * Renames any leftover temporary files that match the target pattern.
 *
 * @param hdfs the target file system
 * @param baseUrl base URL
 * @param baseName base file name
 * @param tmpSuffix temporary file suffix
 */
public static void execute(FileSystem hdfs, String baseUrl, String baseName, String tmpSuffix) {
    String baseRealUrl = baseUrl;
    if (baseRealUrl.endsWith("/") == false) {
        baseRealUrl = baseRealUrl + "/";
    }

    String targetPattern = baseRealUrl + baseName + "[0-9]*" + tmpSuffix + "*";
    Path targetPathPattern = new Path(targetPattern);

    FileStatus[] targetTmpFiles = null;
    try {
        targetTmpFiles = hdfs.globStatus(targetPathPattern);
    } catch (IOException ioex) {
        logger.warn("Failed to search preprocess target files. Skip preprocess.", ioex);
        return;
    }

    if (targetTmpFiles.length == 0) {
        String logFormat = "Preprocess target files not exist. Path={0}";
        String logMessage = MessageFormat.format(logFormat, targetPattern);
        logger.info(logMessage);
        return;
    }

    if (logger.isInfoEnabled() == true) {
        printTargetPathList(targetTmpFiles);
    }

    for (FileStatus targetTmpFile : targetTmpFiles) {
        renameTmpFile(hdfs, targetTmpFile.getPath().toString(), tmpSuffix);
    }
}
From source file:bigfat.hadoop.HDFSDirInputStream.java
License:Apache License
/**
 * Create an input stream that will read through all the files in one
 * directory. Note that the files will be sorted by name, using the
 * given comparator.
 *
 * @param fs the file system
 * @param dir the directory to read
 * @param comp the comparator used to sort file names, or null for natural ordering
 * @throws IOException
 */
public HDFSDirInputStream(FileSystem fs, String dir, Comparator<String> comp) throws IOException {
    this.fs = fs;
    Path p = new Path(dir);

    FileStatus fstate = fs.getFileStatus(p);
    if (fstate.isDir()) {
        FileStatus[] child = fs.globStatus(new Path(dir + "/*"));
        LinkedList<String> s = new LinkedList<String>();
        Map<String, Path> map = new HashMap<String, Path>();
        for (FileStatus c : child) {
            if (c.isDir())
                continue;
            map.put(c.getPath().getName(), c.getPath());
            s.add(c.getPath().getName());
        }

        if (comp != null)
            Collections.sort(s, comp);
        else
            Collections.sort(s);

        Iterator<String> it = s.iterator();
        while (it.hasNext()) {
            String n = it.next();
            Path pr = map.get(n);
            this.appendFile(pr.toString());
        }
    } else {
        this.appendFile(dir);
    }
}
From source file:cascading.tap.GlobHfs.java
License:Open Source License
private Tap[] makeTaps(JobConf conf) throws IOException {
    FileStatus[] statusList = null;
    Path path = new Path(pathPattern);
    FileSystem fileSystem = path.getFileSystem(conf);

    if (pathFilter == null)
        statusList = fileSystem.globStatus(path);
    else
        statusList = fileSystem.globStatus(path, pathFilter);

    if (statusList == null || statusList.length == 0)
        throw new TapException("unable to find paths matching path pattern: " + pathPattern);

    List<Hfs> notEmpty = new ArrayList<Hfs>();

    for (int i = 0; i < statusList.length; i++) {
        // remove empty files. turns out a directory returns a length not zero
        // so this jives with the expectations set in the above javadoc
        if (statusList[i].getLen() != 0)
            notEmpty.add(new Hfs(getScheme(), statusList[i].getPath().toString()));
    }

    if (notEmpty.isEmpty())
        throw new TapException("all paths matching path pattern are zero length: " + pathPattern);

    return notEmpty.toArray(new Tap[notEmpty.size()]);
}
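The example above also exercises the two-argument overload globStatus(Path, PathFilter), which applies a filter to the glob matches. A minimal sketch of that overload in isolation (the part-* pattern and the .tmp suffix are illustrative):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class GlobWithFilterExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path pattern = new Path("/tmp/output/part-*"); // illustrative pattern
        FileSystem fs = pattern.getFileSystem(conf);

        // Keep only matches that do not carry a ".tmp" suffix.
        PathFilter noTmpFiles = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return !path.getName().endsWith(".tmp");
            }
        };

        FileStatus[] matches = fs.globStatus(pattern, noTmpFiles);
        if (matches == null || matches.length == 0) {
            System.out.println("nothing matched " + pattern);
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath());
        }
    }
}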
From source file:cascading.tap.hadoop.BaseDistCacheTap.java
License:Open Source License
@Override
public TupleEntryIterator openForRead(FlowProcess<? extends Configuration> flowProcess, RecordReader input)
        throws IOException {
    // always read via Hadoop FileSystem if in standalone/local mode, or if a RecordReader is provided
    if (HadoopUtil.isLocal(flowProcess.getConfig()) || input != null) {
        LOG.info("delegating to parent");
        return super.openForRead(flowProcess, input);
    }

    Path[] cachedFiles = getLocalCacheFiles(flowProcess);

    if (cachedFiles == null || cachedFiles.length == 0)
        return super.openForRead(flowProcess, null);

    List<Path> paths = new ArrayList<>();
    List<Tap> taps = new ArrayList<>();

    if (isSimpleGlob()) {
        FileSystem fs = FileSystem.get(flowProcess.getConfig());
        FileStatus[] statuses = fs.globStatus(getHfs().getPath());

        for (FileStatus status : statuses)
            paths.add(status.getPath());
    } else {
        paths.add(getHfs().getPath());
    }

    for (Path pathToFind : paths) {
        for (Path path : cachedFiles) {
            if (path.toString().endsWith(pathToFind.getName())) {
                LOG.info("found {} in distributed cache", path);
                taps.add(new Lfs(getScheme(), path.toString()));
            }
        }
    }

    if (taps.isEmpty()) // not in cache, read from HDFS
    {
        LOG.info("could not find files in local resource path. delegating to parent: {}",
                super.getIdentifier());
        return super.openForRead(flowProcess, input);
    }

    return new MultiSourceTap(taps.toArray(new Tap[taps.size()])).openForRead(flowProcess, input);
}
From source file:cascading.tap.hadoop.BaseDistCacheTap.java
License:Open Source License
private void registerHfs(FlowProcess<? extends Configuration> process, Configuration conf, Hfs hfs)
        throws IOException {
    if (isSimpleGlob()) {
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] statuses = fs.globStatus(getHfs().getPath());

        if (statuses == null || statuses.length == 0)
            throw new TapException(String.format(
                    "glob expression %s does not match any files on the filesystem", getHfs().getPath()));

        for (FileStatus fileStatus : statuses)
            registerURI(conf, fileStatus.getPath());
    } else {
        registerURI(conf, hfs.getPath());
    }

    hfs.sourceConfInitComplete(process, conf);
}
From source file:cascading.tap.hadoop.GlobHfs.java
License:Open Source License
private Hfs[] makeTaps(Configuration conf) throws IOException {
    FileStatus[] statusList;
    Path path = new Path(pathPattern);
    FileSystem fileSystem = path.getFileSystem(conf);

    if (pathFilter == null)
        statusList = fileSystem.globStatus(path);
    else
        statusList = fileSystem.globStatus(path, pathFilter);

    if (statusList == null || statusList.length == 0)
        throw new TapException("unable to find paths matching path pattern: " + pathPattern);

    List<Hfs> notEmpty = new ArrayList<Hfs>();

    for (int i = 0; i < statusList.length; i++) {
        // remove empty files. some hadoop versions return non-zero for dirs
        // so this jives with the expectations set in the above javadoc
        if (statusList[i].isDir() || statusList[i].getLen() != 0)
            notEmpty.add(new Hfs(getScheme(), statusList[i].getPath().toString()));
    }

    if (notEmpty.isEmpty())
        throw new TapException(
                "all paths matching path pattern are zero length and not directories: " + pathPattern);

    return notEmpty.toArray(new Hfs[notEmpty.size()]);
}
From source file:com.anhth12.lambda.BatchUpdateFunction.java
@Override
public Void call(JavaPairRDD<K, M> newData, Time timestamp) throws Exception {
    if (newData.take(1).isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return null;
    }

    log.info("Beginning update at {}", timestamp);

    Configuration hadoopConf = sparkContext.hadoopConfiguration();

    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        JavaPairRDD<Writable, Writable> pastWriteableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFilter.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWriteableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
    }

    try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                producer);
    }
    return null;
}
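The joinFSPaths helper used above is private to the source class and not shown here. A plausible sketch, assuming it only joins the qualified, escaped match paths into the comma-separated form that FileInputFormat.INPUT_DIR expects:

// Hypothetical reconstruction, not the original helper: joins glob matches
// into the comma-separated, escaped list that FileInputFormat.INPUT_DIR expects.
private static String joinFSPaths(FileSystem fs, FileStatus[] statuses) {
    StringBuilder joined = new StringBuilder();
    for (FileStatus status : statuses) {
        if (joined.length() > 0) {
            joined.append(',');
        }
        Path path = fs.makeQualified(status.getPath());
        joined.append(org.apache.hadoop.util.StringUtils.escapeString(path.toString()));
    }
    return joined.toString();
}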
From source file:com.anhth12.lambda.BatchUpdateFunction2.java
@Override
public Void call(JavaRDD<MessageAndMetadata> newData, Time timestamp) throws Exception {
    if (newData.take(1).isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return null;
    }

    log.info("Beginning update at {}", timestamp);

    JavaPairRDD<K, M> newDataKM = newData.mapToPair(new PairFunction<MessageAndMetadata, K, M>() {
        @Override
        public Tuple2<K, M> call(MessageAndMetadata t) throws Exception {
            return (Tuple2<K, M>) new Tuple2<>(new String(t.getKey()), new String(t.getPayload()));
        }
    });

    Configuration hadoopConf = sparkContext.hadoopConfiguration();

    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        JavaPairRDD<Writable, Writable> pastWriteableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFilter.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWriteableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
    }

    try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newDataKM, pastData, modelDirString,
                producer);
    }
    return null;
}
From source file:com.asakusafw.bulkloader.cache.CacheBuildTest.java
License:Apache License
private List<TestDataModel> collect(CacheStorage storage, Path contents) throws IOException {
    List<TestDataModel> results = new ArrayList<>();
    FileSystem fs = storage.getFileSystem();
    for (FileStatus status : fs.globStatus(contents)) {
        results.addAll(collectContent(fs, status));
    }
    Collections.sort(results);
    return results;
}
From source file:com.asakusafw.bulkloader.collector.ExportFileSend.java
License:Apache License
/**
 * Writes the contents of the export target files as TSV and sends them
 * through a {@link com.asakusafw.bulkloader.transfer.FileList.Writer}.
 * @param <T> the data model type
 * @param targetTableModel the model class for the export target table
 * @param filePath the export file path (may be a glob pattern)
 * @param writer the writer to send files through
 * @param tableName the target table name
 * @return the number of records sent, or -1 if there were no files to send
 * @throws BulkLoaderSystemException if the send operation fails
 */
protected <T extends Writable> long send(Class<T> targetTableModel, String filePath, FileList.Writer writer,
        String tableName) throws BulkLoaderSystemException {
    FileSystem fs = null;
    String fileName = null;
    // maximum size of a single transfer file
    long maxSize = Long.parseLong(ConfigurationLoader.getProperty(Constants.PROP_KEY_EXP_LOAD_MAX_SIZE));
    try {
        TsvIoFactory<T> factory = new TsvIoFactory<>(targetTableModel);
        Configuration conf = new Configuration();
        fs = FileSystem.get(new URI(filePath), conf);

        // resolve the export file path pattern
        FileStatus[] status = fs.globStatus(new Path(filePath));
        Path[] listedPaths = FileUtil.stat2Paths(status);
        if (listedPaths == null) {
            LOG.info("TG-COLLECTOR-02006", tableName, filePath);
            return -1;
        } else {
            LOG.info("TG-COLLECTOR-02007", listedPaths.length, tableName, filePath);
        }
        long count = 0;
        boolean addEntry = false;
        for (Path path : listedPaths) {
            // skip system files
            if (isSystemFile(path)) {
                continue;
            }

            // TODO
            // open the next source file
            ModelInput<T> input = TemporaryStorage.openInput(conf, targetTableModel, path);
            try {
                while (true) {
                    // open the next entry in the transfer file list
                    addEntry = true;
                    fileName = FileNameUtil.createSendExportFileName(tableName, fileNameMap);
                    OutputStream output = writer.openNext(FileList.content(fileName));
                    try {
                        CountingOutputStream counter = new CountingOutputStream(output);
                        ModelOutput<T> modelOut = factory.createModelOutput(counter);
                        T model = factory.createModelObject();
                        LOG.info("TG-COLLECTOR-02004", tableName, path.toString(), fileName);

                        // copy each model object from the source file as TSV
                        boolean nextFile = false;
                        while (input.readTo(model)) {
                            // write the model object
                            modelOut.write(model);
                            count++;
                            // split the transfer once the size limit is exceeded;
                            // the byte count is only approximate because of
                            // char-to-byte conversion, but that is acceptable here
                            if (counter.getByteCount() > maxSize) {
                                nextFile = true;
                                break;
                            }
                        }
                        modelOut.close();
                        LOG.info("TG-COLLECTOR-02005", tableName, path.toString(), fileName);
                        if (nextFile) {
                            // size limit reached: continue with a new transfer file
                            continue;
                        } else {
                            // the current source file is exhausted
                            break;
                        }
                    } finally {
                        output.close();
                    }
                }
            } finally {
                input.close();
            }
        }
        if (addEntry) {
            return count;
        } else {
            assert count == 0;
            return -1;
        }
    } catch (IOException e) {
        throw new BulkLoaderSystemException(e, getClass(), "TG-COLLECTOR-02001", MessageFormat
                .format("Failed to read export file from HDFS. path={0}, send file={1}", filePath, fileName));
    } catch (URISyntaxException e) {
        throw new BulkLoaderSystemException(e, getClass(), "TG-COLLECTOR-02001",
                MessageFormat.format("Invalid URI for HDFS path: {0}", filePath));
    } finally {
        if (fs != null) {
            try {
                fs.close();
            } catch (IOException e) {
                throw new BulkLoaderSystemException(e, this.getClass(), "TG-COLLECTOR-02001",
                        MessageFormat.format("Failed to close the HDFS file system. path={0}", filePath));
            }
        }
    }
}