List of usage examples for org.apache.hadoop.fs.PathFilter
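PathFilter is the single-method callback that Hadoop's FileSystem.listStatus (and globStatus) consults to decide which paths a directory listing should include. Before the real-world examples, here is a minimal self-contained sketch; the /tmp/job-output path and the "part-" prefix are placeholder assumptions, not taken from any project below:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterDemo {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        // Keep only the "part-" output files of a (hypothetical) job output directory.
        FileStatus[] parts = fs.listStatus(new Path("/tmp/job-output"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().startsWith("part-");
            }
        });
        for (FileStatus status : parts) {
            System.out.println(status.getPath());
        }
    }
}

Every example that follows uses the same pattern: build a PathFilter, usually as an anonymous class, and hand it to listStatus to restrict the listing by file-name prefix or suffix.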
From source file:ml.shifu.shifu.core.processor.PostTrainModelProcessor.java
License:Apache License
private List<Integer> getFeatureImportance(SourceType source, String output) throws IOException {
    List<Integer> featureImportance = new ArrayList<Integer>();
    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners(output, source, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().contains("part-r-");
            }
        });
        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine().trim();
                String[] keyValues = line.split("\t");
                String key = keyValues[0];
                featureImportance.add(Integer.parseInt(key));
            }
        }
    } finally {
        // release
        closeScanners(scanners);
    }
    return featureImportance;
}
From source file:ml.shifu.shifu.fs.ShifuFileUtils.java
License:Apache License
/**
 * Get the data scanners for a specified path.
 * If the path is a directory, get the scanners of all normal sub-files;
 * if it is a normal file, get its scanner.
 * !!! Notice: all hidden files (file names starting with ".") will be skipped.
 * !!! Warning: scanner instances should be closed by the caller.
 *
 * @param path - file path to get the scanner for
 * @param sourceType - local/hdfs
 * @return scanners for the specified path
 * @throws IOException - if any I/O exception occurs in processing
 */
public static List<Scanner> getDataScanners(String path, SourceType sourceType) throws IOException {
    FileSystem fs = getFileSystemBySourceType(sourceType);

    FileStatus[] listStatus;
    Path p = new Path(path);
    if (fs.getFileStatus(p).isDir()) {
        // for a folder we need to filter out pig header files
        listStatus = fs.listStatus(p, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return (!path.getName().startsWith(Constants.HIDDEN_FILES));
            }
        });
    } else {
        listStatus = new FileStatus[] { fs.getFileStatus(p) };
    }

    List<Scanner> scanners = new ArrayList<Scanner>();
    for (FileStatus f : listStatus) {
        String filename = f.getPath().getName();
        if (f.isDir()) {
            log.warn("Skip - {}, since it's a directory, please check your configuration.", filename);
            continue;
        }
        log.debug("Creating Scanner for file: {}", filename);
        if (filename.endsWith(Constants.GZ_SUFFIX)) {
            scanners.add(new Scanner(new GZIPInputStream(fs.open(f.getPath()))));
        } else if (filename.endsWith(Constants.BZ2_SUFFIX)) {
            scanners.add(new Scanner(new BZip2CompressorInputStream(fs.open(f.getPath()))));
        } else {
            scanners.add(new Scanner(new BufferedInputStream(fs.open(f.getPath()))));
        }
    }

    return scanners;
}
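An aside on the filter style: PathFilter declares a single abstract method, accept(Path), so on Java 8 and later the anonymous class above can be replaced by a lambda. A minimal sketch, reusing fs, p, and Constants.HIDDEN_FILES from the example above:

// Same hidden-file filter as a lambda; behavior is unchanged.
listStatus = fs.listStatus(p, path -> !path.getName().startsWith(Constants.HIDDEN_FILES));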
From source file:net.sf.katta.operation.master.IndexDeployOperation.java
License:Apache License
protected static List<Shard> readShardsFromFs(final String indexName, final String indexPathString)
        throws IndexDeployException {
    // get shard folders from source
    URI uri;
    try {
        uri = new URI(indexPathString);
    } catch (final URISyntaxException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "unable to parse index path uri '"
                + indexPathString + "', make sure it starts with file:// or hdfs://", e);
    }
    FileSystem fileSystem;
    try {
        fileSystem = HadoopUtil.getFileSystem(new Path(uri.toString()));
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "unable to retrieve file system for index path '" + indexPathString
                        + "', make sure your path starts with a hadoop-supported prefix like file:// or hdfs://", e);
    }
    List<Shard> shards = new ArrayList<Shard>();
    try {
        final Path indexPath = new Path(indexPathString);
        if (!fileSystem.exists(indexPath)) {
            throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                    "index path '" + uri + "' does not exist");
        }
        final FileStatus[] listStatus = fileSystem.listStatus(indexPath, new PathFilter() {
            public boolean accept(final Path aPath) {
                return !aPath.getName().startsWith(".");
            }
        });
        for (final FileStatus fileStatus : listStatus) {
            String shardPath = fileStatus.getPath().toString();
            if (fileStatus.isDir() || shardPath.endsWith(".zip")) {
                shards.add(new Shard(createShardName(indexName, shardPath), shardPath));
            }
        }
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "could not access index path: " + indexPathString, e);
    }

    if (shards.size() == 0) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "index does not contain any shards");
    }
    return shards;
}
From source file:net.team1.dev.HousingAnalysis.java
License:Apache License
/**
 * Walk through the input directory to get all the input files and construct the
 * file-path-to-mapper-class dictionary.
 *
 * @param inputDir the path of the input directory
 * @param fs       the HDFS file system object
 * @return a {@link HashMap} whose keys are the file paths and whose values are the corresponding mapper classes
 * @throws IOException IOException
 */
private static HashMap<Path, Class<? extends Mapper>> getInputFilePaths(Path inputDir, FileSystem fs)
        throws IOException {
    HashMap<Path, Class<? extends Mapper>> mappers = new HashMap<>();
    FileStatus[] files = fs.listStatus(inputDir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return name.endsWith(".txt") && name.startsWith("thads");
        }
    });
    for (FileStatus f : files) {
        Path p = f.getPath();
        String fileName = p.getName();
        if (fileName.contains("2013"))
            mappers.put(p, Mapper2013.class);
        else if (fileName.contains("2003"))
            mappers.put(p, Mapper2003.class);
        else if (fileName.contains("2005"))
            mappers.put(p, Mapper2005.class);
        else if (fileName.contains("2007"))
            mappers.put(p, Mapper2007.class);
        else if (fileName.contains("2009"))
            mappers.put(p, Mapper2009.class);
        else if (fileName.contains("2011"))
            mappers.put(p, Mapper2011.class);
    }
    return mappers;
}
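When the goal is only to restrict which files a MapReduce job reads, a PathFilter can also be registered on the job itself instead of being applied by hand. A hedged sketch using the mapreduce-API FileInputFormat.setInputPathFilter, assuming a flat input directory; ThadsTxtFilter is a hypothetical name, not part of HousingAnalysis, and note that the framework combines the registered filter with its own hidden-file filter:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class ThadsTxtFilter implements PathFilter {
    // The framework instantiates this class reflectively, so it needs a
    // public no-arg constructor (the implicit one here is enough).
    @Override
    public boolean accept(Path path) {
        String name = path.getName();
        return name.endsWith(".txt") && name.startsWith("thads");
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "housing-analysis");
        FileInputFormat.setInputPathFilter(job, ThadsTxtFilter.class);
        // ... the rest of the job setup would go here ...
    }
}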
From source file:org.apache.blur.analysis.HdfsFieldManager.java
License:Apache License
@Override
protected List<String> getFieldNamesToLoad() throws IOException {
    Tracer trace = Trace.trace("filesystem - getFieldNamesToLoad", Trace.param("storagePath", _storagePath));
    try {
        if (!_fileSystem.exists(_storagePath)) {
            return EMPTY_LIST;
        }
        FileStatus[] listStatus = _fileSystem.listStatus(_storagePath, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(TYPE_FILE_EXT);
            }
        });
        if (listStatus == null) {
            return EMPTY_LIST;
        }
        List<String> fieldNames = new ArrayList<String>();
        for (FileStatus fileStatus : listStatus) {
            if (!fileStatus.isDir()) {
                String fileName = fileStatus.getPath().getName();
                fieldNames.add(fileName.substring(0, fileName.lastIndexOf(TYPE_FILE_EXT)));
            }
        }
        return fieldNames;
    } finally {
        trace.done();
    }
}
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
public long getSegmentImportPendingCount() throws IOException {
    Path path = _shardContext.getHdfsDirPath();
    Configuration configuration = _shardContext.getTableContext().getConfiguration();
    FileSystem fileSystem = path.getFileSystem(configuration);
    for (int i = 0; i < 10; i++) {
        try {
            FileStatus[] listStatus = fileSystem.listStatus(path, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return path != null && path.getName().endsWith(COMMIT);
                }
            });
            return listStatus.length;
        } catch (FileNotFoundException e) {
            LOG.warn("File not found error, retrying.");
        }
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            return 0L;
        }
    }
    throw new IOException("Received too many errors. Giving up.");
}
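A possible alternative for suffix counts like this is a glob instead of a filter: FileSystem.globStatus expands a shell-style pattern against the directory contents. A hedged sketch, reusing fileSystem, path, and COMMIT from the example above and assuming COMMIT is a plain file-name suffix with no glob metacharacters (this also drops the retry loop, so it is not a drop-in replacement):

// Count the *<COMMIT> entries with a glob rather than a PathFilter.
// globStatus returns null when the parent path does not exist.
FileStatus[] matches = fileSystem.globStatus(new Path(path, "*" + COMMIT));
long pendingCount = (matches == null) ? 0L : matches.length;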
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
public long getSegmentImportInProgressCount() throws IOException {
    Path path = _shardContext.getHdfsDirPath();
    Configuration configuration = _shardContext.getTableContext().getConfiguration();
    FileSystem fileSystem = path.getFileSystem(configuration);
    for (int i = 0; i < 10; i++) {
        try {
            FileStatus[] listStatus = fileSystem.listStatus(path, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return path != null && path.getName().endsWith(INUSE);
                }
            });
            long count = 0;
            for (FileStatus fileStatus : listStatus) {
                Path p = fileStatus.getPath();
                if (fileSystem.exists(new Path(p, INPROGRESS))) {
                    count++;
                }
            }
            return count;
        } catch (FileNotFoundException e) {
            LOG.warn("File not found error, retrying.");
        }
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            return 0L;
        }
    }
    throw new IOException("Received too many errors. Giving up.");
}
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
@Override
public void run() {
    // Only allow one import to occur in the process at a time.
    _globalLock.lock();
    try {
        if (_lastCleanup + _cleanupDelay < System.currentTimeMillis()) {
            try {
                cleanupOldDirs();
            } catch (IOException e) {
                LOG.error("Unknown error while trying to clean old directories on [{1}/{2}].", e, _shard, _table);
            }
            _lastCleanup = System.currentTimeMillis();
        }
        Path path = _shardContext.getHdfsDirPath();
        Configuration configuration = _shardContext.getTableContext().getConfiguration();
        try {
            FileSystem fileSystem = path.getFileSystem(configuration);
            SortedSet<FileStatus> listStatus;
            while (true) {
                try {
                    listStatus = sort(fileSystem.listStatus(path, new PathFilter() {
                        @Override
                        public boolean accept(Path path) {
                            return path != null && path.getName().endsWith(COMMIT);
                        }
                    }));
                    break;
                } catch (FileNotFoundException e) {
                    LOG.warn("File not found error, retrying.");
                }
                try {
                    Thread.sleep(100);
                } catch (InterruptedException e) {
                    return;
                }
            }
            for (FileStatus fileStatus : listStatus) {
                Path file = fileStatus.getPath();
                if (fileStatus.isDir() && file.getName().endsWith(COMMIT)) {
                    // rename to inuse; if good, continue, else rename to badindex
                    Path inuse = new Path(file.getParent(), rename(file.getName(), INUSE));
                    touch(fileSystem, new Path(file, INPROGRESS));
                    if (fileSystem.rename(file, inuse)) {
                        if (_testError != null) {
                            _testError.run();
                        }
                        HdfsDirectory hdfsDirectory = new HdfsDirectory(configuration, inuse);
                        try {
                            if (DirectoryReader.indexExists(hdfsDirectory)) {
                                IndexAction indexAction = getIndexAction(hdfsDirectory, fileSystem);
                                _blurIndex.process(indexAction);
                                return;
                            } else {
                                Path badindex = new Path(file.getParent(), rename(file.getName(), BADINDEX));
                                if (fileSystem.rename(inuse, badindex)) {
                                    LOG.error("Directory found at [{0}] is not a valid index, renaming to [{1}].",
                                            inuse, badindex);
                                } else {
                                    LOG.fatal("Directory found at [{0}] is not a valid index, could not rename to [{1}].",
                                            inuse, badindex);
                                }
                            }
                        } finally {
                            hdfsDirectory.close();
                        }
                    } else {
                        LOG.fatal("Could not rename [{0}] to inuse dir.", file);
                    }
                }
            }
        } catch (IOException e) {
            LOG.error("Unknown error while trying to refresh imports on [{1}/{2}].", e, _shard, _table);
        }
    } finally {
        _globalLock.unlock();
    }
}
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
public void cleanupOldDirs() throws IOException {
    Path hdfsDirPath = _shardContext.getHdfsDirPath();
    TableContext tableContext = _shardContext.getTableContext();
    Configuration configuration = tableContext.getConfiguration();
    FileSystem fileSystem = hdfsDirPath.getFileSystem(configuration);
    FileStatus[] inuseSubDirs = fileSystem.listStatus(hdfsDirPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(INUSE);
        }
    });
    Set<Path> inuseDirs = toSet(inuseSubDirs);
    Map<Path, Path> inuseFileToDir = toMap(fileSystem, inuseDirs);
    FileStatus[] listStatus = fileSystem.listStatus(hdfsDirPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(HdfsDirectory.LNK);
        }
    });
    for (FileStatus status : listStatus) {
        Path realPath = HdfsDirectory.readRealPathDataFromSymlinkPath(fileSystem, status.getPath());
        Path inuseDir = inuseFileToDir.get(realPath);
        inuseDirs.remove(inuseDir);
        // If the inuse dir has an inprogress file, remove it, because there
        // are files that reference this dir, so it had to be committed.
        Path path = new Path(inuseDir, INPROGRESS);
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, false);
            if (_thriftCache != null) {
                _thriftCache.clearTable(_table);
            }
        }
    }
    // Check if any inuse dirs have inprogress files.
    // If they do, rename inuse to commit to retry the import.
    for (Path inuse : new HashSet<Path>(inuseDirs)) {
        Path path = new Path(inuse, INPROGRESS);
        if (fileSystem.exists(path)) {
            LOG.info("Path [{0}] is not imported but has inprogress file, retrying import.", path);
            inuseDirs.remove(inuse);
            Path commit = new Path(inuse.getParent(), rename(inuse.getName(), COMMIT));
            fileSystem.rename(inuse, commit);
        }
    }
    for (Path p : inuseDirs) {
        LOG.info("Deleting path [{0}] no longer in use.", p);
        fileSystem.delete(p, true);
    }
}
From source file:org.apache.blur.mapreduce.lib.BlurInputFormat.java
License:Apache License
private static List<BlurInputSplit> getSegmentSplits(final Path dir, ExecutorService service,
        final Configuration configuration, final Text table, final Text snapshot) throws IOException {
    FileSystem fileSystem = dir.getFileSystem(configuration);
    FileStatus[] shardDirs = fileSystem.listStatus(dir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(BlurConstants.SHARD_PREFIX);
        }
    });
    List<Future<List<BlurInputSplit>>> futures = new ArrayList<Future<List<BlurInputSplit>>>();
    for (final FileStatus shardFileStatus : shardDirs) {
        futures.add(service.submit(new Callable<List<BlurInputSplit>>() {
            @Override
            public List<BlurInputSplit> call() throws Exception {
                return getSegmentSplits(shardFileStatus.getPath(), configuration, table, snapshot);
            }
        }));
    }
    List<BlurInputSplit> results = new ArrayList<BlurInputSplit>();
    for (Future<List<BlurInputSplit>> future : futures) {
        try {
            results.addAll(future.get());
        } catch (InterruptedException e) {
            throw new IOException(e);
        } catch (ExecutionException e) {
            Throwable cause = e.getCause();
            if (cause instanceof IOException) {
                throw (IOException) cause;
            } else {
                throw new IOException(cause);
            }
        }
    }
    return results;
}