Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usage for org.apache.hadoop.fs.FileSystem.globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
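
A minimal sketch of calling globStatus directly, assuming a default Configuration and a made-up glob pattern ("/data/logs/2023-*/part-*" is illustrative, not taken from the examples below):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // illustrative glob pattern; replace with a real path on your cluster
        FileStatus[] matches = fs.globStatus(new Path("/data/logs/2023-*/part-*"));

        // globStatus returns null when the pattern contains no glob and the path does not exist,
        // and an empty array when the glob matches nothing, so guard against both
        if (matches == null || matches.length == 0) {
            System.out.println("no files matched the pattern");
            return;
        }

        for (FileStatus status : matches) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}

As the examples below show, callers typically check for both a null result and an empty array before iterating over the returned FileStatus entries.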

Usage

From source file:acromusashi.stream.bolt.hdfs.HdfsPreProcessor.java

License:Open Source License

/**
 * Preprocessing for the HDFS sink.<br>
 * If temporary files remain from a previous run, rename them to their final names.
 * 
 * @param hdfs HDFS file system
 * @param baseUrl base URL of the output directory
 * @param baseName base name of the output files
 * @param tmpSuffix suffix that marks temporary files
 */
public static void execute(FileSystem hdfs, String baseUrl, String baseName, String tmpSuffix) {
    String baseRealUrl = baseUrl;

    if (!baseRealUrl.endsWith("/")) {
        baseRealUrl = baseRealUrl + "/";
    }

    String targetPattern = baseRealUrl + baseName + "[0-9]*" + tmpSuffix + "*";
    Path targetPathPattern = new Path(targetPattern);

    FileStatus[] targetTmpFiles = null;

    try {
        targetTmpFiles = hdfs.globStatus(targetPathPattern);
    } catch (IOException ioex) {
        logger.warn("Failed to search preprocess target files. Skip preprocess.", ioex);
        return;
    }

    if (targetTmpFiles.length == 0) {
        String logFormat = "Preprocess target files not exist. Path={0}";
        String logMessage = MessageFormat.format(logFormat, targetPattern);
        logger.info(logMessage);
        return;
    }

    if (logger.isInfoEnabled()) {
        printTargetPathList(targetTmpFiles);
    }

    for (FileStatus targetTmpFile : targetTmpFiles) {
        renameTmpFile(hdfs, targetTmpFile.getPath().toString(), tmpSuffix);
    }

}

From source file:bigfat.hadoop.HDFSDirInputStream.java

License:Apache License

/**
 * Creates an input stream that reads through all the files in one
 * directory. Note that the files are sorted by name, using the given
 * comparator.
 * 
 * @param fs the file system to read from
 * @param dir the directory to read
 * @param comp comparator used to order the file names (natural order if null)
 * @throws IOException if the directory cannot be listed
 */
public HDFSDirInputStream(FileSystem fs, String dir, Comparator<String> comp) throws IOException {
    this.fs = fs;
    Path p = new Path(dir);
    FileStatus fstate = fs.getFileStatus(p);
    if (fstate.isDir()) {
        FileStatus[] child = fs.globStatus(new Path(dir + "/*"));
        LinkedList<String> s = new LinkedList<String>();
        Map<String, Path> map = new HashMap<String, Path>();
        for (FileStatus c : child) {
            if (c.isDir())
                continue;
            map.put(c.getPath().getName(), c.getPath());
            s.add(c.getPath().getName());
        }
        if (comp != null)
            Collections.sort(s, comp);
        else
            Collections.sort(s);
        Iterator<String> it = s.iterator();
        while (it.hasNext()) {
            String n = it.next();
            Path pr = map.get(n);
            this.appendFile(pr.toString());
        }
    } else {
        this.appendFile(dir);
    }
}

From source file:cascading.tap.GlobHfs.java

License:Open Source License

private Tap[] makeTaps(JobConf conf) throws IOException {
    FileStatus[] statusList = null;

    Path path = new Path(pathPattern);

    FileSystem fileSystem = path.getFileSystem(conf);

    if (pathFilter == null)
        statusList = fileSystem.globStatus(path);
    else
        statusList = fileSystem.globStatus(path, pathFilter);

    if (statusList == null || statusList.length == 0)
        throw new TapException("unable to find paths matching path pattern: " + pathPattern);

    List<Hfs> notEmpty = new ArrayList<Hfs>();

    for (int i = 0; i < statusList.length; i++) {
        // remove empty files. turns out a directory returns a length not zero
        // so this jives with the expectations set in the above javadoc
        if (statusList[i].getLen() != 0)
            notEmpty.add(new Hfs(getScheme(), statusList[i].getPath().toString()));
    }

    if (notEmpty.isEmpty())
        throw new TapException("all paths matching path pattern are zero length: " + pathPattern);

    return notEmpty.toArray(new Tap[notEmpty.size()]);
}

From source file:cascading.tap.hadoop.BaseDistCacheTap.java

License:Open Source License

@Override
public TupleEntryIterator openForRead(FlowProcess<? extends Configuration> flowProcess, RecordReader input)
        throws IOException {
    // always read via Hadoop FileSystem if in standalone/local mode, or if a RecordReader is provided
    if (HadoopUtil.isLocal(flowProcess.getConfig()) || input != null) {
        LOG.info("delegating to parent");
        return super.openForRead(flowProcess, input);
    }

    Path[] cachedFiles = getLocalCacheFiles(flowProcess);

    if (cachedFiles == null || cachedFiles.length == 0)
        return super.openForRead(flowProcess, null);

    List<Path> paths = new ArrayList<>();
    List<Tap> taps = new ArrayList<>();

    if (isSimpleGlob()) {
        FileSystem fs = FileSystem.get(flowProcess.getConfig());
        FileStatus[] statuses = fs.globStatus(getHfs().getPath());

        for (FileStatus status : statuses)
            paths.add(status.getPath());
    } else {
        paths.add(getHfs().getPath());
    }

    for (Path pathToFind : paths) {
        for (Path path : cachedFiles) {
            if (path.toString().endsWith(pathToFind.getName())) {
                LOG.info("found {} in distributed cache", path);
                taps.add(new Lfs(getScheme(), path.toString()));
            }
        }
    }

    if (taps.isEmpty()) // not in cache, read from HDFS
    {
        LOG.info("could not find files in local resource path. delegating to parent: {}",
                super.getIdentifier());
        return super.openForRead(flowProcess, input);
    }

    return new MultiSourceTap(taps.toArray(new Tap[taps.size()])).openForRead(flowProcess, input);
}

From source file:cascading.tap.hadoop.BaseDistCacheTap.java

License:Open Source License

private void registerHfs(FlowProcess<? extends Configuration> process, Configuration conf, Hfs hfs)
        throws IOException {
    if (isSimpleGlob()) {
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] statuses = fs.globStatus(getHfs().getPath());

        if (statuses == null || statuses.length == 0)
            throw new TapException(String.format(
                    "glob expression %s does not match any files on the filesystem", getHfs().getPath()));

        for (FileStatus fileStatus : statuses)
            registerURI(conf, fileStatus.getPath());
    } else {
        registerURI(conf, hfs.getPath());
    }

    hfs.sourceConfInitComplete(process, conf);
}

From source file:cascading.tap.hadoop.GlobHfs.java

License:Open Source License

private Hfs[] makeTaps(Configuration conf) throws IOException {
    FileStatus[] statusList;

    Path path = new Path(pathPattern);

    FileSystem fileSystem = path.getFileSystem(conf);

    if (pathFilter == null)
        statusList = fileSystem.globStatus(path);
    else
        statusList = fileSystem.globStatus(path, pathFilter);

    if (statusList == null || statusList.length == 0)
        throw new TapException("unable to find paths matching path pattern: " + pathPattern);

    List<Hfs> notEmpty = new ArrayList<Hfs>();

    for (int i = 0; i < statusList.length; i++) {
        // remove empty files. some hadoop versions return non-zero for dirs
        // so this jives with the expectations set in the above javadoc
        if (statusList[i].isDir() || statusList[i].getLen() != 0)
            notEmpty.add(new Hfs(getScheme(), statusList[i].getPath().toString()));
    }

    if (notEmpty.isEmpty())
        throw new TapException(
                "all paths matching path pattern are zero length and not directories: " + pathPattern);

    return notEmpty.toArray(new Hfs[notEmpty.size()]);
}

From source file:com.anhth12.lambda.BatchUpdateFunction.java

@Override
public Void call(JavaPairRDD<K, M> newData, Time timestamp) throws Exception {
    if (newData.take(1).isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return null;
    }

    log.info("Beginning update at {}", timestamp);

    Configuration hadoopConf = sparkContext.hadoopConfiguration();

    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);

    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        JavaPairRDD<Writable, Writable> pastWriteableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFilter.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWriteableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));

    }

    try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                producer);
    }
    return null;
}

From source file:com.anhth12.lambda.BatchUpdateFunction2.java

@Override
public Void call(JavaRDD<MessageAndMetadata> newData, Time timestamp) throws Exception {
    if (newData.take(1).isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return null;
    }

    log.info("Beginning update at {}", timestamp);

    JavaPairRDD<K, M> newDataKM = newData.mapToPair(new PairFunction<MessageAndMetadata, K, M>() {

        @Override
        public Tuple2<K, M> call(MessageAndMetadata t) throws Exception {

            return (Tuple2<K, M>) new Tuple2<>(new String(t.getKey()), new String(t.getPayload()));
        }
    });

    Configuration hadoopConf = sparkContext.hadoopConfiguration();

    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);

    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        JavaPairRDD<Writable, Writable> pastWriteableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFilter.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWriteableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));

    }
    try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newDataKM, pastData, modelDirString,
                producer);
    }
    return null;

}

From source file:com.asakusafw.bulkloader.cache.CacheBuildTest.java

License:Apache License

private List<TestDataModel> collect(CacheStorage storage, Path contents) throws IOException {
    List<TestDataModel> results = new ArrayList<>();
    FileSystem fs = storage.getFileSystem();
    for (FileStatus status : fs.globStatus(contents)) {
        results.addAll(collectContent(fs, status));
    }
    Collections.sort(results);
    return results;
}

From source file:com.asakusafw.bulkloader.collector.ExportFileSend.java

License:Apache License

/**
 * Reads the export target files as TSV records and writes them out through a
 * {@link com.asakusafw.bulkloader.transfer.FileList.Writer}.
 * @param <T> the data model type
 * @param targetTableModel Model class for the export target table
 * @param filePath path (pattern) of the export target files
 * @param writer destination Writer
 * @param tableName name of the export target table
 * @return the number of records sent, or -1 if there were no files to send
 * @throws BulkLoaderSystemException if sending the files fails
 */
protected <T extends Writable> long send(Class<T> targetTableModel, String filePath, FileList.Writer writer,
        String tableName) throws BulkLoaderSystemException {
    FileSystem fs = null;
    String fileName = null;

    // maximum size of each file to send
    long maxSize = Long.parseLong(ConfigurationLoader.getProperty(Constants.PROP_KEY_EXP_LOAD_MAX_SIZE));

    try {
        TsvIoFactory<T> factory = new TsvIoFactory<>(targetTableModel);
        Configuration conf = new Configuration();
        fs = FileSystem.get(new URI(filePath), conf);

        // list the files to send
        FileStatus[] status = fs.globStatus(new Path(filePath));
        Path[] listedPaths = FileUtil.stat2Paths(status);
        if (listedPaths == null) {
            LOG.info("TG-COLLECTOR-02006", tableName, filePath);
            return -1;
        } else {
            LOG.info("TG-COLLECTOR-02007", listedPaths.length, tableName, filePath);
        }
        long count = 0;
        boolean addEntry = false;
        for (Path path : listedPaths) {
            // skip system files
            if (isSystemFile(path)) {
                continue;
            }

            // TODO
            // open the target file from temporary storage
            ModelInput<T> input = TemporaryStorage.openInput(conf, targetTableModel, path);
            try {
                while (true) {
                    // open the next transfer file entry
                    addEntry = true;
                    fileName = FileNameUtil.createSendExportFileName(tableName, fileNameMap);
                    OutputStream output = writer.openNext(FileList.content(fileName));
                    try {
                        CountingOutputStream counter = new CountingOutputStream(output);
                        ModelOutput<T> modelOut = factory.createModelOutput(counter);
                        T model = factory.createModelObject();
                        LOG.info("TG-COLLECTOR-02004", tableName, path.toString(), fileName);

                        // read each Model and write it out as TSV
                        boolean nextFile = false;
                        while (input.readTo(model)) {
                            // write the Model
                            modelOut.write(model);
                            count++;
                            // check the accumulated output size;
                            // the char-to-byte conversion makes the exact size hard to predict,
                            // so the limit may be exceeded slightly (this is tolerated)
                            if (counter.getByteCount() > maxSize) {
                                nextFile = true;
                                break;
                            }
                        }
                        modelOut.close();
                        LOG.info("TG-COLLECTOR-02005", tableName, path.toString(), fileName);

                        if (nextFile) {
                            // size limit reached; continue with the next transfer file
                            continue;
                        } else {
                            // the input file has been fully read; stop
                            break;
                        }
                    } finally {
                        output.close();
                    }
                }
            } finally {
                input.close();
            }
        }
        if (addEntry) {
            return count;
        } else {
            assert count == 0;
            return -1;
        }
    } catch (IOException e) {
        throw new BulkLoaderSystemException(e, getClass(), "TG-COLLECTOR-02001", MessageFormat
                .format("HDFS?{0} ???{1}", filePath, fileName));
    } catch (URISyntaxException e) {
        throw new BulkLoaderSystemException(e, getClass(), "TG-COLLECTOR-02001",
                MessageFormat.format("HDFS???HDFS?{0}", filePath));
    } finally {
        if (fs != null) {
            try {
                fs.close();
            } catch (IOException e) {
                throw new BulkLoaderSystemException(e, this.getClass(), "TG-COLLECTOR-02001",
                        MessageFormat.format(
                                "HDFS???URI{0}",
                                filePath));
            }
        }
    }
}