Example usage for org.apache.hadoop.fs PathFilter PathFilter

Introduction

This page collects example usages of the org.apache.hadoop.fs.PathFilter interface, which is typically instantiated inline as an anonymous class: new PathFilter() { ... }.

Prototype

public interface PathFilter {
    boolean accept(Path path);
}
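
Because PathFilter declares a single abstract method, every example on this page implements it inline as an anonymous class. Below is a minimal, self-contained sketch of the idiom; the directory path is a placeholder, and on Java 8+ a lambda such as path -> !path.getName().startsWith(".") works as well.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterSketch {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // List only non-hidden entries: the most common pattern in the examples below.
        FileStatus[] visible = fs.listStatus(new Path("/tmp/input"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return !path.getName().startsWith(".");
            }
        });
        for (FileStatus status : visible) {
            System.out.println(status.getPath());
        }
    }
}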

Usage

From source file:ml.shifu.shifu.core.processor.PostTrainModelProcessor.java

License:Apache License

private List<Integer> getFeatureImportance(SourceType source, String output) throws IOException {
    List<Integer> featureImportance = new ArrayList<Integer>();
    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners(output, source, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().contains("part-r-");
            }
        });

        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine().trim();
                String[] keyValues = line.split("\t");
                String key = keyValues[0];
                featureImportance.add(Integer.parseInt(key));
            }
        }
    } finally {
        // release
        closeScanners(scanners);
    }
    return featureImportance;
}

From source file:ml.shifu.shifu.fs.ShifuFileUtils.java

License:Apache License

/**
 * Get the data scanners for the specified path.
 * If the path is a directory, return a scanner for each regular sub-file;
 * if it is a regular file, return its scanner.
 * !!! Notice: all hidden files (names starting with ".") are skipped.
 * !!! Warning: the returned scanners must be closed by the caller.
 *
 * @param path       - file path to get the scanner for
 * @param sourceType - local/hdfs
 * @return scanners for the specified path
 * @throws IOException - if any I/O exception occurs in processing
 */
public static List<Scanner> getDataScanners(String path, SourceType sourceType) throws IOException {
    FileSystem fs = getFileSystemBySourceType(sourceType);

    FileStatus[] listStatus;
    Path p = new Path(path);
    if (fs.getFileStatus(p).isDir()) {
        // for folder we need filter pig header files
        listStatus = fs.listStatus(p, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return (!path.getName().startsWith(Constants.HIDDEN_FILES));
            }
        });
    } else {
        listStatus = new FileStatus[] { fs.getFileStatus(p) };
    }

    List<Scanner> scanners = new ArrayList<Scanner>();
    for (FileStatus f : listStatus) {
        String filename = f.getPath().getName();

        if (f.isDir()) {
            log.warn("Skip - {}, since it's direcory, please check your configuration.", filename);
            continue;
        }

        log.debug("Creating Scanner for file: {} ", filename);
        if (filename.endsWith(Constants.GZ_SUFFIX)) {
            scanners.add(new Scanner(new GZIPInputStream(fs.open(f.getPath()))));
        } else if (filename.endsWith(Constants.BZ2_SUFFIX)) {
            scanners.add(new Scanner(new BZip2CompressorInputStream(fs.open(f.getPath()))));
        } else {
            scanners.add(new Scanner(new BufferedInputStream(fs.open(f.getPath()))));
        }
    }

    return scanners;
}
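
As the Javadoc warns, the caller owns the returned scanners. A hedged sketch of such a caller follows; the path is a placeholder, and SourceType.HDFS is assumed to be a valid enum constant as suggested by the first example above.

    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners("hdfs:///tmp/output", SourceType.HDFS);
        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                System.out.println(scanner.nextLine()); // placeholder processing
            }
        }
    } finally {
        if (scanners != null) {
            for (Scanner scanner : scanners) {
                scanner.close(); // release the underlying streams
            }
        }
    }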

From source file:net.sf.katta.operation.master.IndexDeployOperation.java

License:Apache License

protected static List<Shard> readShardsFromFs(final String indexName, final String indexPathString)
        throws IndexDeployException {
    // get shard folders from source
    URI uri;
    try {
        uri = new URI(indexPathString);
    } catch (final URISyntaxException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "unable to parse index path uri '"
                + indexPathString + "', make sure it starts with file:// or hdfs:// ", e);
    }
    FileSystem fileSystem;
    try {
        fileSystem = HadoopUtil.getFileSystem(new Path(uri.toString()));
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "unable to retrive file system for index path '" + indexPathString
                        + "', make sure your path starts with hadoop support prefix like file:// or hdfs://",
                e);
    }

    List<Shard> shards = new ArrayList<Shard>();
    try {
        final Path indexPath = new Path(indexPathString);
        if (!fileSystem.exists(indexPath)) {
            throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                    "index path '" + uri + "' does not exists");
        }
        final FileStatus[] listStatus = fileSystem.listStatus(indexPath, new PathFilter() {
            public boolean accept(final Path aPath) {
                return !aPath.getName().startsWith(".");
            }
        });
        for (final FileStatus fileStatus : listStatus) {
            String shardPath = fileStatus.getPath().toString();
            if (fileStatus.isDir() || shardPath.endsWith(".zip")) {
                shards.add(new Shard(createShardName(indexName, shardPath), shardPath));
            }
        }
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "could not access index path: " + indexPathString, e);
    }

    if (shards.size() == 0) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "index does not contain any shard");
    }
    return shards;
}

From source file:net.team1.dev.HousingAnalysis.java

License:Apache License

/**
 * Walk through the input directory, collect all input files, and build a map
 * from each file path to its corresponding mapper class.
 *
 * @param inputDir The path of the input directory
 * @param fs       The HDFS file system object.
 * @return A {@link HashMap} whose keys are the file paths and whose values are the corresponding mapper classes.
 * @throws IOException if listing the input directory fails
 */
private static HashMap<Path, Class<? extends Mapper>> getInputFilePaths(Path inputDir, FileSystem fs)
        throws IOException {
    HashMap<Path, Class<? extends Mapper>> mappers = new HashMap<>();
    FileStatus[] files = fs.listStatus(inputDir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return name.endsWith(".txt") && name.startsWith("thads");
        }
    });
    for (FileStatus f : files) {
        Path p = f.getPath();
        String fileName = p.getName();
        if (fileName.contains("2013"))
            mappers.put(p, Mapper2013.class);
        else if (fileName.contains("2003"))
            mappers.put(p, Mapper2003.class);
        else if (fileName.contains("2005"))
            mappers.put(p, Mapper2005.class);
        else if (fileName.contains("2007"))
            mappers.put(p, Mapper2007.class);
        else if (fileName.contains("2009"))
            mappers.put(p, Mapper2009.class);
        else if (fileName.contains("2011"))
            mappers.put(p, Mapper2011.class);
    }
    return mappers;
}
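
A path-to-mapper map like this is typically consumed by MultipleInputs when configuring the job. Below is a hedged sketch of such a caller; the job name is a placeholder and TextInputFormat is an assumption about the input format, since the driver code is not shown in the source.

    // Hypothetical caller: register each input file with its year-specific mapper.
    Job job = Job.getInstance(fs.getConf(), "housing-analysis");
    for (Map.Entry<Path, Class<? extends Mapper>> entry : getInputFilePaths(inputDir, fs).entrySet()) {
        MultipleInputs.addInputPath(job, entry.getKey(), TextInputFormat.class, entry.getValue());
    }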

From source file:org.apache.blur.analysis.HdfsFieldManager.java

License:Apache License

@Override
protected List<String> getFieldNamesToLoad() throws IOException {
    Tracer trace = Trace.trace("filesystem - getFieldNamesToLoad", Trace.param("storagePath", _storagePath));
    try {
        if (!_fileSystem.exists(_storagePath)) {
            return EMPTY_LIST;
        }
        FileStatus[] listStatus = _fileSystem.listStatus(_storagePath, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(TYPE_FILE_EXT);
            }
        });
        if (listStatus == null) {
            return EMPTY_LIST;
        }
        List<String> fieldNames = new ArrayList<String>();
        for (FileStatus fileStatus : listStatus) {
            if (!fileStatus.isDir()) {
                String fileName = fileStatus.getPath().getName();
                fieldNames.add(fileName.substring(0, fileName.lastIndexOf(TYPE_FILE_EXT)));
            }
        }
        return fieldNames;
    } finally {
        trace.done();
    }
}

From source file:org.apache.blur.manager.writer.IndexImporter.java

License:Apache License

public long getSegmentImportPendingCount() throws IOException {
    Path path = _shardContext.getHdfsDirPath();
    Configuration configuration = _shardContext.getTableContext().getConfiguration();
    FileSystem fileSystem = path.getFileSystem(configuration);
    for (int i = 0; i < 10; i++) {
        try {
            FileStatus[] listStatus = fileSystem.listStatus(path, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return path != null && path.getName().endsWith(COMMIT);
                }
            });
            return listStatus.length;
        } catch (FileNotFoundException e) {
            LOG.warn("File not found error, retrying.");
        }
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            return 0L;
        }
    }
    throw new IOException("Received too many errors. Give up.");
}

From source file:org.apache.blur.manager.writer.IndexImporter.java

License:Apache License

public long getSegmentImportInProgressCount() throws IOException {
    Path path = _shardContext.getHdfsDirPath();
    Configuration configuration = _shardContext.getTableContext().getConfiguration();
    FileSystem fileSystem = path.getFileSystem(configuration);
    for (int i = 0; i < 10; i++) {
        try {
            FileStatus[] listStatus = fileSystem.listStatus(path, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return path != null && path.getName().endsWith(INUSE);
                }
            });
            long count = 0;
            for (FileStatus fileStatus : listStatus) {
                Path p = fileStatus.getPath();
                if (fileSystem.exists(new Path(p, INPROGRESS))) {
                    count++;
                }
            }
            return count;
        } catch (FileNotFoundException e) {
            LOG.warn("File not found error, retrying.");
        }
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            return 0L;
        }
    }
    throw new IOException("Received too many errors. Give up.");
}

From source file:org.apache.blur.manager.writer.IndexImporter.java

License:Apache License

@Override
public void run() {
    // Only allow one import to occur in the process at a time.
    _globalLock.lock();
    try {
        if (_lastCleanup + _cleanupDelay < System.currentTimeMillis()) {
            try {
                cleanupOldDirs();
            } catch (IOException e) {
                LOG.error("Unknown error while trying to clean old directories on [{1}/{2}].", e, _shard,
                        _table);
            }
            _lastCleanup = System.currentTimeMillis();
        }
        Path path = _shardContext.getHdfsDirPath();
        Configuration configuration = _shardContext.getTableContext().getConfiguration();
        try {
            FileSystem fileSystem = path.getFileSystem(configuration);
            SortedSet<FileStatus> listStatus;
            while (true) {
                try {
                    listStatus = sort(fileSystem.listStatus(path, new PathFilter() {
                        @Override
                        public boolean accept(Path path) {
                            return path != null && path.getName().endsWith(COMMIT);
                        }
                    }));
                    break;
                } catch (FileNotFoundException e) {
                    LOG.warn("File not found error, retrying.");
                }
                try {
                    Thread.sleep(100);
                } catch (InterruptedException e) {
                    return;
                }
            }
            for (FileStatus fileStatus : listStatus) {
                Path file = fileStatus.getPath();
                if (fileStatus.isDir() && file.getName().endsWith(COMMIT)) {
                    // rename to inuse; if the index is valid, hand it off, otherwise rename to badindex
                    Path inuse = new Path(file.getParent(), rename(file.getName(), INUSE));
                    touch(fileSystem, new Path(file, INPROGRESS));
                    if (fileSystem.rename(file, inuse)) {
                        if (_testError != null) {
                            _testError.run();
                        }
                        HdfsDirectory hdfsDirectory = new HdfsDirectory(configuration, inuse);
                        try {
                            if (DirectoryReader.indexExists(hdfsDirectory)) {
                                IndexAction indexAction = getIndexAction(hdfsDirectory, fileSystem);
                                _blurIndex.process(indexAction);
                                return;
                            } else {
                                Path badindex = new Path(file.getParent(), rename(file.getName(), BADINDEX));
                                if (fileSystem.rename(inuse, badindex)) {
                                    LOG.error(
                                            "Directory found at [{0}] is not a vaild index, renaming to [{1}].",
                                            inuse, badindex);
                                } else {
                                    LOG.fatal(
                                            "Directory found at [{0}] is not a vaild index, could not rename to [{1}].",
                                            inuse, badindex);
                                }
                            }
                        } finally {
                            hdfsDirectory.close();
                        }
                    } else {
                        LOG.fatal("Could not rename [{0}] to inuse dir.", file);
                    }
                }
            }
        } catch (IOException e) {
            LOG.error("Unknown error while trying to refresh imports on [{1}/{2}].", e, _shard, _table);
        }
    } finally {
        _globalLock.unlock();
    }
}

From source file:org.apache.blur.manager.writer.IndexImporter.java

License:Apache License

public void cleanupOldDirs() throws IOException {
    Path hdfsDirPath = _shardContext.getHdfsDirPath();
    TableContext tableContext = _shardContext.getTableContext();
    Configuration configuration = tableContext.getConfiguration();
    FileSystem fileSystem = hdfsDirPath.getFileSystem(configuration);
    FileStatus[] inuseSubDirs = fileSystem.listStatus(hdfsDirPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(INUSE);
        }
    });
    Set<Path> inuseDirs = toSet(inuseSubDirs);
    Map<Path, Path> inuseFileToDir = toMap(fileSystem, inuseDirs);
    FileStatus[] listStatus = fileSystem.listStatus(hdfsDirPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(HdfsDirectory.LNK);
        }
    });

    for (FileStatus status : listStatus) {
        Path realPath = HdfsDirectory.readRealPathDataFromSymlinkPath(fileSystem, status.getPath());
        Path inuseDir = inuseFileToDir.get(realPath);
        inuseDirs.remove(inuseDir);
        // if the inuse dir still has an inprogress file, remove it: files
        // reference this dir, so the import must have been committed.
        Path path = new Path(inuseDir, INPROGRESS);
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, false);
            if (_thriftCache != null) {
                _thriftCache.clearTable(_table);
            }
        }
    }

    // Check if any inuse dirs have inprogress files.
    // If they do, rename inuse to commit to retry import.
    for (Path inuse : new HashSet<Path>(inuseDirs)) {
        Path path = new Path(inuse, INPROGRESS);
        if (fileSystem.exists(path)) {
            LOG.info("Path [{0}] is not imported but has inprogress file, retrying import.", path);
            inuseDirs.remove(inuse);
            Path commit = new Path(inuse.getParent(), rename(inuse.getName(), COMMIT));
            fileSystem.rename(inuse, commit);
        }
    }

    for (Path p : inuseDirs) {
        LOG.info("Deleting path [{0}] no longer in use.", p);
        fileSystem.delete(p, true);
    }
}

From source file:org.apache.blur.mapreduce.lib.BlurInputFormat.java

License:Apache License

private static List<BlurInputSplit> getSegmentSplits(final Path dir, ExecutorService service,
        final Configuration configuration, final Text table, final Text snapshot) throws IOException {

    FileSystem fileSystem = dir.getFileSystem(configuration);
    FileStatus[] shardDirs = fileSystem.listStatus(dir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(BlurConstants.SHARD_PREFIX);
        }
    });

    List<Future<List<BlurInputSplit>>> futures = new ArrayList<Future<List<BlurInputSplit>>>();
    for (final FileStatus shardFileStatus : shardDirs) {
        futures.add(service.submit(new Callable<List<BlurInputSplit>>() {
            @Override
            public List<BlurInputSplit> call() throws Exception {
                return getSegmentSplits(shardFileStatus.getPath(), configuration, table, snapshot);
            }
        }));
    }

    List<BlurInputSplit> results = new ArrayList<BlurInputSplit>();
    for (Future<List<BlurInputSplit>> future : futures) {
        try {
            results.addAll(future.get());
        } catch (InterruptedException e) {
            throw new IOException(e);
        } catch (ExecutionException e) {
            Throwable cause = e.getCause();
            if (cause instanceof IOException) {
                throw (IOException) cause;
            } else {
                throw new IOException(cause);
            }
        }
    }
    return results;
}