List of usage examples for org.apache.hadoop.fs.PathFilter
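PathFilter is the single-method callback that Hadoop's FileSystem.listStatus (and globStatus) consults to decide which paths a directory listing should include. Before the real-world examples, here is a minimal self-contained sketch; the /tmp/job-output path and the "part-" prefix are placeholder assumptions, not taken from any project below:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterDemo {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        // Keep only the "part-" output files of a (hypothetical) job output directory.
        FileStatus[] parts = fs.listStatus(new Path("/tmp/job-output"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().startsWith("part-");
            }
        });
        for (FileStatus status : parts) {
            System.out.println(status.getPath());
        }
    }
}

Every example that follows uses the same pattern: build a PathFilter, usually as an anonymous class, and hand it to listStatus to restrict the listing by file-name prefix or suffix.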
From source file:ml.shifu.shifu.core.processor.PostTrainModelProcessor.java
License:Apache License
private List<Integer> getFeatureImportance(SourceType source, String output) throws IOException {
    List<Integer> featureImportance = new ArrayList<Integer>();
    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners(output, source, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().contains("part-r-");
            }
        });
        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine().trim();
                String[] keyValues = line.split("\t");
                String key = keyValues[0];
                featureImportance.add(Integer.parseInt(key));
            }
        }
    } finally {
        // release
        closeScanners(scanners);
    }
    return featureImportance;
}
From source file:ml.shifu.shifu.fs.ShifuFileUtils.java
License:Apache License
/**
 * Get the data scanners for a specified path.
 * If the path is a directory, get the scanners of all normal sub-files;
 * if it is a normal file, get its scanner.
 * !!! Notice: all hidden files (file names starting with ".") will be skipped.
 * !!! Warning: scanner instances should be closed by the caller.
 *
 * @param path - file path to get the scanner for
 * @param sourceType - local/hdfs
 * @return scanners for the specified path
 * @throws IOException - if any I/O exception occurs in processing
 */
public static List<Scanner> getDataScanners(String path, SourceType sourceType) throws IOException {
    FileSystem fs = getFileSystemBySourceType(sourceType);

    FileStatus[] listStatus;
    Path p = new Path(path);
    if (fs.getFileStatus(p).isDir()) {
        // for a folder we need to filter out pig header files
        listStatus = fs.listStatus(p, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return (!path.getName().startsWith(Constants.HIDDEN_FILES));
            }
        });
    } else {
        listStatus = new FileStatus[] { fs.getFileStatus(p) };
    }

    List<Scanner> scanners = new ArrayList<Scanner>();
    for (FileStatus f : listStatus) {
        String filename = f.getPath().getName();
        if (f.isDir()) {
            log.warn("Skip - {}, since it's a directory, please check your configuration.", filename);
            continue;
        }
        log.debug("Creating Scanner for file: {}", filename);
        if (filename.endsWith(Constants.GZ_SUFFIX)) {
            scanners.add(new Scanner(new GZIPInputStream(fs.open(f.getPath()))));
        } else if (filename.endsWith(Constants.BZ2_SUFFIX)) {
            scanners.add(new Scanner(new BZip2CompressorInputStream(fs.open(f.getPath()))));
        } else {
            scanners.add(new Scanner(new BufferedInputStream(fs.open(f.getPath()))));
        }
    }

    return scanners;
}
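An aside on the filter style: PathFilter declares a single abstract method, accept(Path), so on Java 8 and later the anonymous class above can be replaced by a lambda. A minimal sketch, reusing fs, p, and Constants.HIDDEN_FILES from the example above:

// Same hidden-file filter as a lambda; behavior is unchanged.
listStatus = fs.listStatus(p, path -> !path.getName().startsWith(Constants.HIDDEN_FILES));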
From source file:net.sf.katta.operation.master.IndexDeployOperation.java
License:Apache License
protected static List<Shard> readShardsFromFs(final String indexName, final String indexPathString)
        throws IndexDeployException {
    // get shard folders from source
    URI uri;
    try {
        uri = new URI(indexPathString);
    } catch (final URISyntaxException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "unable to parse index path uri '"
                + indexPathString + "', make sure it starts with file:// or hdfs://", e);
    }
    FileSystem fileSystem;
    try {
        fileSystem = HadoopUtil.getFileSystem(new Path(uri.toString()));
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "unable to retrieve file system for index path '" + indexPathString
                        + "', make sure your path starts with a hadoop-supported prefix like file:// or hdfs://", e);
    }
    List<Shard> shards = new ArrayList<Shard>();
    try {
        final Path indexPath = new Path(indexPathString);
        if (!fileSystem.exists(indexPath)) {
            throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                    "index path '" + uri + "' does not exist");
        }
        final FileStatus[] listStatus = fileSystem.listStatus(indexPath, new PathFilter() {
            public boolean accept(final Path aPath) {
                return !aPath.getName().startsWith(".");
            }
        });
        for (final FileStatus fileStatus : listStatus) {
            String shardPath = fileStatus.getPath().toString();
            if (fileStatus.isDir() || shardPath.endsWith(".zip")) {
                shards.add(new Shard(createShardName(indexName, shardPath), shardPath));
            }
        }
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "could not access index path: " + indexPathString, e);
    }

    if (shards.size() == 0) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "index does not contain any shards");
    }
    return shards;
}
From source file:net.team1.dev.HousingAnalysis.java
License:Apache License
/**
 * Walk through the input directory to get all the input files and construct the
 * file-path-to-mapper-class dictionary.
 *
 * @param inputDir the path of the input directory
 * @param fs       the HDFS file system object
 * @return a {@link HashMap} whose keys are the file paths and whose values are the corresponding mapper classes
 * @throws IOException IOException
 */
private static HashMap<Path, Class<? extends Mapper>> getInputFilePaths(Path inputDir, FileSystem fs)
        throws IOException {
    HashMap<Path, Class<? extends Mapper>> mappers = new HashMap<>();
    FileStatus[] files = fs.listStatus(inputDir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return name.endsWith(".txt") && name.startsWith("thads");
        }
    });
    for (FileStatus f : files) {
        Path p = f.getPath();
        String fileName = p.getName();
        if (fileName.contains("2013"))
            mappers.put(p, Mapper2013.class);
        else if (fileName.contains("2003"))
            mappers.put(p, Mapper2003.class);
        else if (fileName.contains("2005"))
            mappers.put(p, Mapper2005.class);
        else if (fileName.contains("2007"))
            mappers.put(p, Mapper2007.class);
        else if (fileName.contains("2009"))
            mappers.put(p, Mapper2009.class);
        else if (fileName.contains("2011"))
            mappers.put(p, Mapper2011.class);
    }
    return mappers;
}
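When the goal is only to restrict which files a MapReduce job reads, a PathFilter can also be registered on the job itself instead of being applied by hand. A hedged sketch using the mapreduce-API FileInputFormat.setInputPathFilter, assuming a flat input directory; ThadsTxtFilter is a hypothetical name, not part of HousingAnalysis, and note that the framework combines the registered filter with its own hidden-file filter:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class ThadsTxtFilter implements PathFilter {
    // The framework instantiates this class reflectively, so it needs a
    // public no-arg constructor (the implicit one here is enough).
    @Override
    public boolean accept(Path path) {
        String name = path.getName();
        return name.endsWith(".txt") && name.startsWith("thads");
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "housing-analysis");
        FileInputFormat.setInputPathFilter(job, ThadsTxtFilter.class);
        // ... the rest of the job setup would go here ...
    }
}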
From source file:org.apache.blur.analysis.HdfsFieldManager.java
License:Apache License
@Override
protected List<String> getFieldNamesToLoad() throws IOException {
    Tracer trace = Trace.trace("filesystem - getFieldNamesToLoad", Trace.param("storagePath", _storagePath));
    try {
        if (!_fileSystem.exists(_storagePath)) {
            return EMPTY_LIST;
        }
        FileStatus[] listStatus = _fileSystem.listStatus(_storagePath, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(TYPE_FILE_EXT);
            }
        });
        if (listStatus == null) {
            return EMPTY_LIST;
        }
        List<String> fieldNames = new ArrayList<String>();
        for (FileStatus fileStatus : listStatus) {
            if (!fileStatus.isDir()) {
                String fileName = fileStatus.getPath().getName();
                fieldNames.add(fileName.substring(0, fileName.lastIndexOf(TYPE_FILE_EXT)));
            }
        }
        return fieldNames;
    } finally {
        trace.done();
    }
}
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
public long getSegmentImportPendingCount() throws IOException {
    Path path = _shardContext.getHdfsDirPath();
    Configuration configuration = _shardContext.getTableContext().getConfiguration();
    FileSystem fileSystem = path.getFileSystem(configuration);
    for (int i = 0; i < 10; i++) {
        try {
            FileStatus[] listStatus = fileSystem.listStatus(path, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return path != null && path.getName().endsWith(COMMIT);
                }
            });
            return listStatus.length;
        } catch (FileNotFoundException e) {
            LOG.warn("File not found error, retrying.");
        }
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            return 0L;
        }
    }
    throw new IOException("Received too many errors. Giving up.");
}
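A possible alternative for suffix counts like this is a glob instead of a filter: FileSystem.globStatus expands a shell-style pattern against the directory contents. A hedged sketch, reusing fileSystem, path, and COMMIT from the example above and assuming COMMIT is a plain file-name suffix with no glob metacharacters (this also drops the retry loop, so it is not a drop-in replacement):

// Count the *<COMMIT> entries with a glob rather than a PathFilter.
// globStatus returns null when the parent path does not exist.
FileStatus[] matches = fileSystem.globStatus(new Path(path, "*" + COMMIT));
long pendingCount = (matches == null) ? 0L : matches.length;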
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
public long getSegmentImportInProgressCount() throws IOException {
    Path path = _shardContext.getHdfsDirPath();
    Configuration configuration = _shardContext.getTableContext().getConfiguration();
    FileSystem fileSystem = path.getFileSystem(configuration);
    for (int i = 0; i < 10; i++) {
        try {
            FileStatus[] listStatus = fileSystem.listStatus(path, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return path != null && path.getName().endsWith(INUSE);
                }
            });
            long count = 0;
            for (FileStatus fileStatus : listStatus) {
                Path p = fileStatus.getPath();
                if (fileSystem.exists(new Path(p, INPROGRESS))) {
                    count++;
                }
            }
            return count;
        } catch (FileNotFoundException e) {
            LOG.warn("File not found error, retrying.");
        }
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            return 0L;
        }
    }
    throw new IOException("Received too many errors. Giving up.");
}
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
@Override
public void run() {
    // Only allow one import to occur in the process at a time.
    _globalLock.lock();
    try {
        if (_lastCleanup + _cleanupDelay < System.currentTimeMillis()) {
            try {
                cleanupOldDirs();
            } catch (IOException e) {
                LOG.error("Unknown error while trying to clean old directories on [{1}/{2}].", e, _shard, _table);
            }
            _lastCleanup = System.currentTimeMillis();
        }
        Path path = _shardContext.getHdfsDirPath();
        Configuration configuration = _shardContext.getTableContext().getConfiguration();
        try {
            FileSystem fileSystem = path.getFileSystem(configuration);
            SortedSet<FileStatus> listStatus;
            while (true) {
                try {
                    listStatus = sort(fileSystem.listStatus(path, new PathFilter() {
                        @Override
                        public boolean accept(Path path) {
                            return path != null && path.getName().endsWith(COMMIT);
                        }
                    }));
                    break;
                } catch (FileNotFoundException e) {
                    LOG.warn("File not found error, retrying.");
                }
                try {
                    Thread.sleep(100);
                } catch (InterruptedException e) {
                    return;
                }
            }
            for (FileStatus fileStatus : listStatus) {
                Path file = fileStatus.getPath();
                if (fileStatus.isDir() && file.getName().endsWith(COMMIT)) {
                    // rename to inuse; if good, continue, else rename to badindex
                    Path inuse = new Path(file.getParent(), rename(file.getName(), INUSE));
                    touch(fileSystem, new Path(file, INPROGRESS));
                    if (fileSystem.rename(file, inuse)) {
                        if (_testError != null) {
                            _testError.run();
                        }
                        HdfsDirectory hdfsDirectory = new HdfsDirectory(configuration, inuse);
                        try {
                            if (DirectoryReader.indexExists(hdfsDirectory)) {
                                IndexAction indexAction = getIndexAction(hdfsDirectory, fileSystem);
                                _blurIndex.process(indexAction);
                                return;
                            } else {
                                Path badindex = new Path(file.getParent(), rename(file.getName(), BADINDEX));
                                if (fileSystem.rename(inuse, badindex)) {
                                    LOG.error("Directory found at [{0}] is not a valid index, renaming to [{1}].",
                                            inuse, badindex);
                                } else {
                                    LOG.fatal("Directory found at [{0}] is not a valid index, could not rename to [{1}].",
                                            inuse, badindex);
                                }
                            }
                        } finally {
                            hdfsDirectory.close();
                        }
                    } else {
                        LOG.fatal("Could not rename [{0}] to inuse dir.", file);
                    }
                }
            }
        } catch (IOException e) {
            LOG.error("Unknown error while trying to refresh imports on [{1}/{2}].", e, _shard, _table);
        }
    } finally {
        _globalLock.unlock();
    }
}
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
public void cleanupOldDirs() throws IOException {
    Path hdfsDirPath = _shardContext.getHdfsDirPath();
    TableContext tableContext = _shardContext.getTableContext();
    Configuration configuration = tableContext.getConfiguration();
    FileSystem fileSystem = hdfsDirPath.getFileSystem(configuration);
    FileStatus[] inuseSubDirs = fileSystem.listStatus(hdfsDirPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(INUSE);
        }
    });
    Set<Path> inuseDirs = toSet(inuseSubDirs);
    Map<Path, Path> inuseFileToDir = toMap(fileSystem, inuseDirs);
    FileStatus[] listStatus = fileSystem.listStatus(hdfsDirPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(HdfsDirectory.LNK);
        }
    });
    for (FileStatus status : listStatus) {
        Path realPath = HdfsDirectory.readRealPathDataFromSymlinkPath(fileSystem, status.getPath());
        Path inuseDir = inuseFileToDir.get(realPath);
        inuseDirs.remove(inuseDir);
        // If the inuse dir has an inprogress file, remove it, because there
        // are files that reference this dir, so it had to be committed.
        Path path = new Path(inuseDir, INPROGRESS);
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, false);
            if (_thriftCache != null) {
                _thriftCache.clearTable(_table);
            }
        }
    }
    // Check if any inuse dirs have inprogress files.
    // If they do, rename inuse to commit to retry the import.
    for (Path inuse : new HashSet<Path>(inuseDirs)) {
        Path path = new Path(inuse, INPROGRESS);
        if (fileSystem.exists(path)) {
            LOG.info("Path [{0}] is not imported but has inprogress file, retrying import.", path);
            inuseDirs.remove(inuse);
            Path commit = new Path(inuse.getParent(), rename(inuse.getName(), COMMIT));
            fileSystem.rename(inuse, commit);
        }
    }
    for (Path p : inuseDirs) {
        LOG.info("Deleting path [{0}] no longer in use.", p);
        fileSystem.delete(p, true);
    }
}
From source file:org.apache.blur.mapreduce.lib.BlurInputFormat.java
License:Apache License
private static List<BlurInputSplit> getSegmentSplits(final Path dir, ExecutorService service,
        final Configuration configuration, final Text table, final Text snapshot) throws IOException {
    FileSystem fileSystem = dir.getFileSystem(configuration);
    FileStatus[] shardDirs = fileSystem.listStatus(dir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(BlurConstants.SHARD_PREFIX);
        }
    });
    List<Future<List<BlurInputSplit>>> futures = new ArrayList<Future<List<BlurInputSplit>>>();
    for (final FileStatus shardFileStatus : shardDirs) {
        futures.add(service.submit(new Callable<List<BlurInputSplit>>() {
            @Override
            public List<BlurInputSplit> call() throws Exception {
                return getSegmentSplits(shardFileStatus.getPath(), configuration, table, snapshot);
            }
        }));
    }
    List<BlurInputSplit> results = new ArrayList<BlurInputSplit>();
    for (Future<List<BlurInputSplit>> future : futures) {
        try {
            results.addAll(future.get());
        } catch (InterruptedException e) {
            throw new IOException(e);
        } catch (ExecutionException e) {
            Throwable cause = e.getCause();
            if (cause instanceof IOException) {
                throw (IOException) cause;
            } else {
                throw new IOException(cause);
            }
        }
    }
    return results;
}