List of usage examples for org.apache.hadoop.fs.PathFilter
PathFilter
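PathFilter is a single-method callback interface (boolean accept(Path path)) that FileSystem.listStatus and the FileInputFormat classes use to decide which paths to include in a listing. Every example below follows the same pattern: an anonymous PathFilter that inspects Path.getName(). As a minimal, self-contained sketch of that pattern (the directory argument here is hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListVisibleFiles {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path(args[0]); // e.g. a job output directory
        FileSystem fs = dir.getFileSystem(conf);
        // Skip Hadoop bookkeeping entries such as _SUCCESS and hidden dot-files.
        PathFilter visibleOnly = new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        };
        for (FileStatus status : fs.listStatus(dir, visibleOnly)) {
            System.out.println(status.getPath());
        }
    }
}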
From source file:com.cloudera.science.quince.SchemaUtils.java
License:Open Source License
public static Path findFile(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (fs.isDirectory(path)) {
        // Pick the first visible file, skipping bookkeeping entries such as
        // _SUCCESS and hidden dot-files.
        FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        });
        return fileStatuses[0].getPath();
    } else {
        return path;
    }
}
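A hypothetical call site for the method above, assuming the job output directory holds a single visible data file next to bookkeeping files like _SUCCESS (if there are several visible files, only the first listed one is returned):

// Hypothetical usage: resolve the single data file under a job output directory.
Configuration conf = new Configuration();
Path dataFile = SchemaUtils.findFile(new Path("/user/quince/output"), conf);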
From source file:com.cloudera.seismic.segy.SegyUnloader.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption("input", true, "SU sequence files to export from Hadoop");
    options.addOption("output", true, "The local SU file to write");

    // Parse the command line and check for required arguments.
    CommandLine cmdLine = new PosixParser().parse(options, args, false);
    if (!cmdLine.hasOption("input") || !cmdLine.hasOption("output")) {
        System.out.println("Missing required input/output arguments");
        new HelpFormatter().printHelp("SegyUnloader", options);
        System.exit(1);
    }

    Configuration conf = getConf();
    FileSystem hdfs = FileSystem.get(conf);
    Path inputPath = new Path(cmdLine.getOptionValue("input"));
    if (!hdfs.exists(inputPath)) {
        System.out.println("Input path does not exist");
        System.exit(1);
    }

    PathFilter pf = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };
    DataOutputStream os = new DataOutputStream(new FileOutputStream(cmdLine.getOptionValue("output")));
    for (FileStatus fs : hdfs.listStatus(inputPath, pf)) {
        write(fs.getPath(), os, conf);
    }
    os.close();
    return 0;
}
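SegyUnloader overrides run(String[]) and calls getConf(), which suggests the usual org.apache.hadoop.util.Tool contract. A hypothetical driver, assuming the class does implement Tool (the main method below is not part of the excerpt):

// Hypothetical main, assuming SegyUnloader implements org.apache.hadoop.util.Tool.
// Requires org.apache.hadoop.util.ToolRunner and org.apache.hadoop.conf.Configuration.
public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new Configuration(), new SegyUnloader(), args);
    System.exit(exitCode);
}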
From source file:com.dasasian.chok.operation.master.IndexDeployOperation.java
License:Apache License
protected static List<Shard> readShardsFromFs(final String indexName, final String indexPathString)
        throws IndexDeployException {
    // Get shard folders from the source.
    URI uri;
    try {
        uri = new URI(indexPathString);
    } catch (final URISyntaxException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "unable to parse index path uri '"
                + indexPathString + "', make sure it starts with file:// or hdfs://", e);
    }
    FileSystem fileSystem;
    try {
        fileSystem = HadoopUtil.getFileSystem(new Path(uri.toString()));
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "unable to retrieve file system for index path '" + indexPathString
                        + "', make sure your path starts with a Hadoop-supported prefix like file:// or hdfs://", e);
    }
    List<Shard> shards = new ArrayList<>();
    try {
        final Path indexPath = new Path(indexPathString);
        if (!fileSystem.exists(indexPath)) {
            throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                    "index path '" + uri + "' does not exist");
        }
        final FileStatus[] listStatus = fileSystem.listStatus(indexPath, new PathFilter() {
            public boolean accept(final Path aPath) {
                return !aPath.getName().startsWith(".");
            }
        });
        for (final FileStatus fileStatus : listStatus) {
            String shardPath = fileStatus.getPath().toString();
            if (fileStatus.isDir() || shardPath.endsWith(".zip")) {
                shards.add(new Shard(createShardName(indexName, shardPath), shardPath));
            }
        }
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "could not access index path: " + indexPathString, e);
    }
    if (shards.size() == 0) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "index does not contain any shard");
    }
    return shards;
}
From source file:com.datatorrent.lib.io.fs.FileStitcher.java
License:Apache License
protected void mergeBlocks(T stitchedFileMetaData) throws IOException {
    // When writing to tmp files there can be vagrant tmp files which we have to clean.
    final Path dst = new Path(filePath, stitchedFileMetaData.getStitchedFileRelativePath());
    PathFilter tempFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(dst.getName()) && path.getName().endsWith(PART_FILE_EXTENTION);
        }
    };
    if (outputFS.exists(dst.getParent())) {
        FileStatus[] statuses = outputFS.listStatus(dst.getParent(), tempFileFilter);
        for (FileStatus status : statuses) {
            String statusName = status.getPath().getName();
            LOG.debug("deleting vagrant file {}", statusName);
            outputFS.delete(status.getPath(), true);
        }
    }
    tempOutFilePath = new Path(filePath, stitchedFileMetaData.getStitchedFileRelativePath() + '.'
            + System.currentTimeMillis() + PART_FILE_EXTENTION);
    try {
        writeTempOutputFile(stitchedFileMetaData);
        moveToFinalFile(stitchedFileMetaData);
    } catch (BlockNotFoundException e) {
        LOG.warn("Block file {} not found. Assuming recovery mode for file {}. ", e.getBlockPath(),
                stitchedFileMetaData.getStitchedFileRelativePath());
        // Remove the temp output file.
        outputFS.delete(tempOutFilePath, false);
    }
}
From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java
License:Apache License
/**
 * Read the feature frequency list which is built at the end of the Word Count job and assign ids to the
 * features. This will use constant memory and will run at the speed of your disk read.
 */
private static List<Path> createDictionaryChunks(Path dictPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);
    FileSystem fs = FileSystem.get(dictPath.toUri(), conf);
    // Collect the dictionary files, ignoring checksum side files.
    FileStatus[] dictFiles = fs.listStatus(dictPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return name.startsWith("dictionary.") && !name.endsWith(".crc");
        }
    });
    for (int i = 0; i < dictFiles.length; i++) {
        chunkPaths.add(dictFiles[i].getPath());
    }
    return chunkPaths;
}
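A hypothetical consumer of the returned chunk list, assuming the usual Mahout dictionary layout in which each dictionary file is a SequenceFile of (Text term, IntWritable id); fs and conf are as in the method above, and the SequenceFile, Text, and IntWritable classes come from org.apache.hadoop.io:

// Hypothetical: print every term/id pair from the dictionary chunks.
for (Path chunkPath : chunkPaths) {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, chunkPath, conf);
    try {
        Text term = new Text();
        IntWritable id = new IntWritable();
        while (reader.next(term, id)) {
            System.out.println(term + " -> " + id.get());
        }
    } finally {
        reader.close();
    }
}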
From source file:com.ery.dimport.daemon.TaskManager.java
License:Apache License
public void runTask(final TaskInfo task) {
    // Note: several log/exception messages in this method were mis-encoded in the
    // original source; the English text here is reconstructed from context.
    List<LogHostRunInfoPO> allFiles = new ArrayList<LogHostRunInfoPO>();
    try {
        task.START_TIME = new Date(System.currentTimeMillis());
        boolean needUpdate = false;
        TaskInfo exists = allTask.get(task.TASK_ID);
        if (exists == null) {
            needUpdate = true;
        } else {
            task.hosts = exists.hosts;
        }
        if (task.hosts == null || task.hosts.size() == 0) {
            task.hosts = new ArrayList<String>(master.getServerManager().getOnlineServers().keySet());
            needUpdate = true;
        }
        if (ZKUtil.checkExists(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID) == -1) {
            needUpdate = true;
        }
        if (needUpdate) {
            try {
                task.HOST_SIZE = task.hosts.size();
                master.logWriter.writeLog(task);
                ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID,
                        DImportConstant.Serialize(task));
            } catch (Throwable e) {
            }
        }
        Thread thread = Thread.currentThread();
        ProcessInfo procInfo = null;
        synchronized (taskInProgress) {
            procInfo = taskInProgress.get(task.getRunTaskId());
        }
        procInfo.thread = thread;
        procInfo.startTime = System.currentTimeMillis();
        String filePath = task.FILE_PATH;
        boolean isInHdfs = false;
        final Map<String, Long> files = new HashMap<String, Long>();
        String tmpPath = conf.get(DImportConstant.DIMPORT_PROCESS_TMPDATA_DIR, System.getProperty("user.home"));
        if (tmpPath.endsWith("/")) {
            tmpPath = tmpPath.substring(0, tmpPath.length() - 1);
        }
        if (filePath == null || filePath.equals("")) {
            files.put("", 0L);
        } else {
            if (task.fileNamePattern != null || (task.FILE_FILTER != null && !task.FILE_FILTER.equals(""))) {
                task.FILE_FILTER = DImportConstant.macroProcess(task.FILE_FILTER);
                task.FILE_FILTER = task.FILE_FILTER.replaceAll("\\{host\\}", this.master.hostName);
                task.fileNamePattern = Pattern.compile(task.FILE_FILTER);
            }
            Matcher m = hdfsUrlPattern.matcher(filePath);
            if (m.matches()) {
                isInHdfs = true;
                filePath = m.group(2);
                // for (String string : conf.getValByRegex(".*").keySet()) {
                //     System.out.println(string + "=" + conf.get(string));
                // }
                Path dirPath = new Path(filePath);
                FileSystem fs = FileSystem.get(HadoopConf.getConf(conf));
                if (!fs.exists(dirPath) || !fs.isDirectory(dirPath)) {
                    throw new IOException("HDFS path " + filePath + " does not exist or is not a directory");
                }
                FileStatus[] hFiles = fs.listStatus(dirPath, new PathFilter() {
                    @Override
                    public boolean accept(Path name) {
                        if (task.fileNamePattern != null) {
                            System.out.println("hdfs listStatus:" + name.getParent() + "/" + name.getName());
                            return task.fileNamePattern.matcher(name.getName()).matches();
                        } else {
                            return true;
                        }
                    }
                });
                for (int i = 0; i < hFiles.length; i++) {
                    files.put(hFiles[i].getPath().toString(), hFiles[i].getLen());
                }
            } else {
                java.io.File f = new File(filePath);
                if (!f.exists() || !f.isDirectory()) {
                    throw new IOException("local path " + filePath + " does not exist or is not a directory");
                }
                File[] lFiles = f.listFiles(new FilenameFilter() {
                    public boolean accept(File dir, String name) {
                        if (task.fileNamePattern != null) {
                            System.out.println("local fs listStatus:" + dir + "/" + name);
                            return task.fileNamePattern.matcher(name).matches();
                        } else {
                            return true;
                        }
                    }
                });
                for (int i = 0; i < lFiles.length; i++) {
                    files.put(lFiles[i].getAbsolutePath(), lFiles[i].length());
                }
            }
        }
        for (String fileName : files.keySet()) {
            LogHostRunInfoPO runInfo = new LogHostRunInfoPO(task);
            runInfo.RUN_LOG_ID = DImportConstant.shdf.format(task.SUBMIT_TIME) + "_" + allFiles.size() + "_"
                    + fileName.hashCode();
            runInfo.FILE_NAME = fileName;
            runInfo.RETURN_CODE = 255;
            runInfo.IS_RUN_SUCCESS = -1;
            runInfo.FILE_SIZE = files.get(fileName);
            runInfo.HOST_NAME = master.hostName;
            String localFile = fileName;
            if (isInHdfs) { // map the HDFS file to a local temp path
                localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1);
            }
            String[] cmds = procInfo.task.getCommand();
            for (int j = 0; j < cmds.length; j++) {
                cmds[j] = DImportConstant.macroProcess(cmds[j]);
                cmds[j] = cmds[j].replaceAll("\\{file\\}", localFile);
                cmds[j] = cmds[j].replaceAll("\\{host\\}", master.hostName);
            }
            runInfo.RUN_COMMAND = StringUtils.join(" ", cmds);
            master.logWriter.writeLog(runInfo);
            LOG.info("queued run info: " + runInfo);
            allFiles.add(runInfo);
        }
        ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                DImportConstant.Serialize(allFiles));
        for (LogHostRunInfoPO runInfo : allFiles) {
            if (procInfo.stoped)
                break;
            String fileName = runInfo.FILE_NAME;
            LOG.info("processing file: " + fileName);
            procInfo.RUN_LOG_ID = runInfo.RUN_LOG_ID;
            runInfo.START_TIME = new Date(System.currentTimeMillis());
            procInfo.processFile = fileName;
            String localFile = fileName;
            try {
                if (isInHdfs) {
                    localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1);
                }
                procInfo.task.TASK_COMMAND = runInfo.RUN_COMMAND;
                if (isInHdfs) {
                    File lf = new File(localFile);
                    if (lf.exists())
                        lf.delete();
                    FileSystem fs = FileSystem.get(HadoopConf.getConf(conf));
                    LOG.info("copying HDFS file: " + fileName + " ===> " + localFile);
                    long btime = System.currentTimeMillis();
                    fs.copyToLocalFile(new Path(fileName), new Path(localFile));
                    LOG.info("copied HDFS file: " + fileName + " ===> " + localFile);
                    runInfo.downTime = System.currentTimeMillis() - btime;
                    fileName = localFile;
                }
                updateHostInfoLog(runInfo, allFiles);
                LOG.info(procInfo.task.TASK_NAME + " commandline: " + procInfo.task.TASK_COMMAND);
                procInfo.proc = execResult(runInfo.RUN_COMMAND);
                runInfo.IS_RUN_SUCCESS = 1;
                runInfo.RETURN_CODE = writeProcessLog(procInfo);
                LOG.info(procInfo.task.TASK_NAME + " return value: " + runInfo.RETURN_CODE);
                // runInfo.RETURN_CODE = procInfo.proc.exitValue();
            } catch (Throwable e) {
                runInfo.ERROR_MSG = e.getMessage();
                if (procInfo.proc != null) {
                    try {
                        procInfo.proc.destroy();
                    } catch (Exception ex) {
                    }
                }
                procInfo.proc = null;
                LOG.error("file processing failed", e);
            } finally {
                runInfo.END_TIME = new Date(System.currentTimeMillis());
                master.logWriter.updateLog(runInfo);
                updateHostInfoLog(runInfo, allFiles);
                ZKUtil.createSetData(watcher,
                        watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                        DImportConstant.Serialize(allFiles));
                if (isInHdfs) {
                    File lf = new File(localFile);
                    if (lf.exists())
                        lf.delete();
                }
            }
        }
    } catch (Throwable e) {
        LOG.error("task run failed: " + task, e);
        try {
            if (allFiles.size() > 0) {
                for (LogHostRunInfoPO logHostRunInfoPO : allFiles) {
                    if (logHostRunInfoPO.END_TIME.getTime() < 10000) {
                        logHostRunInfoPO.END_TIME = new Date(System.currentTimeMillis());
                        logHostRunInfoPO.IS_RUN_SUCCESS = 1;
                        logHostRunInfoPO.RETURN_CODE = 2;
                    }
                }
                ZKUtil.createSetData(watcher,
                        watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                        DImportConstant.Serialize(allFiles));
            }
        } catch (KeeperException e1) {
            LOG.error("update task run info on host :" + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/"
                    + master.hostName, e);
        } catch (IOException e1) {
            LOG.error("update task run info on host " + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/"
                    + master.hostName, e);
        }
    } finally {
        synchronized (taskInProgress) {
            taskInProgress.remove(task.getRunTaskId());
        }
    }
}
From source file:com.flipkart.fdp.migration.distcp.core.MirrorDCMImpl.java
License:Apache License
public static void HackMapreduce() throws Exception {
    // Replace the hidden-file filters of both the mapreduce and mapred
    // FileInputFormat classes so that files starting with "_" or "." are
    // no longer skipped.
    DCMConstants.setFinalStatic(
            org.apache.hadoop.mapreduce.lib.input.FileInputFormat.class.getDeclaredField("hiddenFileFilter"),
            new PathFilter() {
                public boolean accept(Path p) {
                    return true;
                }
            });
    DCMConstants.setFinalStatic(
            org.apache.hadoop.mapred.FileInputFormat.class.getDeclaredField("hiddenFileFilter"),
            new PathFilter() {
                public boolean accept(Path p) {
                    return true;
                }
            });
}
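DCMConstants.setFinalStatic is a project helper whose implementation is not shown in this excerpt. A typical sketch of such a helper (an assumption, not the project's actual code; the Field.modifiers trick only works on older JDKs, roughly 11 and below):

import java.lang.reflect.Field;
import java.lang.reflect.Modifier;

static void setFinalStatic(Field field, Object newValue) throws Exception {
    field.setAccessible(true);
    // Clear the final modifier so the static field can be reassigned.
    Field modifiersField = Field.class.getDeclaredField("modifiers");
    modifiersField.setAccessible(true);
    modifiersField.setInt(field, field.getModifiers() & ~Modifier.FINAL);
    field.set(null, newValue);
}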
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HdfsSortedOplogOrganizer.java
License:Apache License
protected FileStatus[] getExpiryMarkers() throws IOException {
    FileSystem fs = store.getFileSystem();
    if (hoplogReadersController.hoplogs == null || hoplogReadersController.hoplogs.size() == 0) {
        // There are no hoplogs in the system. Maybe the bucket does not exist at all.
        if (!fs.exists(bucketPath)) {
            if (logger.isDebugEnabled())
                logger.debug("{}This bucket is unused, skipping expired hoplog check", logPrefix);
            return null;
        }
    }
    FileStatus files[] = FSUtils.listStatus(fs, bucketPath, new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All expired hoplogs end with the expire extension and must match the valid file regex.
            String fileName = file.getName();
            if (!fileName.endsWith(EXPIRED_HOPLOG_EXTENSION)) {
                return false;
            }
            fileName = truncateExpiryExtension(fileName);
            Matcher matcher = SORTED_HOPLOG_PATTERN.matcher(fileName);
            return matcher.find();
        }
    });
    return files;
}
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSUnsortedHoplogOrganizer.java
License:Apache License
public HDFSUnsortedHoplogOrganizer(HdfsRegionManager region, int bucketId) throws IOException {
    super(region, bucketId);
    writer = null;
    sequence = new AtomicInteger(0);
    fileSystem = store.getFileSystem();
    if (!fileSystem.exists(bucketPath)) {
        return;
    }
    FileStatus validHoplogs[] = FSUtils.listStatus(fileSystem, bucketPath, new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All valid hoplog files must match the regex.
            Matcher matcher = HOPLOG_PATTERN.matcher(file.getName());
            return matcher.matches();
        }
    });
    if (validHoplogs != null && validHoplogs.length > 0) {
        for (FileStatus file : validHoplogs) {
            // Account for the disk used by this file.
            incrementDiskUsage(file.getLen());
        }
    }
}
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSUnsortedHoplogOrganizer.java
License:Apache License
/**
 * Fixes the size of hoplogs that were not closed properly last time. Such hoplogs are *.tmphop files.
 * Identify them, open them and close them; this fixes the size. After doing this, rename them to *.hop.
 *
 * @throws IOException
 * @throws ForceReattemptException
 */
void identifyAndFixTmpHoplogs(FileSystem fs) throws IOException, ForceReattemptException {
    if (logger.isDebugEnabled())
        logger.debug("{}Fixing temporary hoplogs", logPrefix);

    // A different filesystem is passed to this function for the following reason:
    // For HDFS, if a file wasn't closed properly last time, then while calling
    // FileSystem.append for this file, FSNamesystem.startFileInternal ->
    // FSNamesystem.recoverLeaseInternal gets called. This function throws
    // AlreadyBeingCreatedException if there is an open handle to any other file
    // created using the same FileSystem object. This is a bug and is being tracked at:
    // https://issues.apache.org/jira/browse/HDFS-3848?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
    //
    // The fix for this bug is not yet part of Pivotal HD. So to overcome the bug,
    // we create a new file system for the timer task so that it does not encounter the bug.
    FileStatus tmpHoplogs[] = FSUtils.listStatus(fs, fs.makeQualified(bucketPath), new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All valid hoplog files must match the regex.
            Matcher matcher = patternForTmpHoplog.matcher(file.getName());
            return matcher.matches();
        }
    });

    if (tmpHoplogs == null || tmpHoplogs.length == 0) {
        if (logger.isDebugEnabled())
            logger.debug("{}No files to fix", logPrefix);
        return;
    }

    // Ping secondaries so that in case of split brain, no other vm has taken up as primary. #50110.
    pingSecondaries();

    if (logger.isDebugEnabled())
        logger.debug("{}Files to fix " + tmpHoplogs.length, logPrefix);

    // Get the current hoplog name; we need to ignore the current hoplog while fixing.
    String currentHoplogName = null;
    if (currentHoplog != null) {
        currentHoplogName = currentHoplog.getFileName();
    }

    for (int i = 0; i < tmpHoplogs.length; i++) {
        // Skip directories.
        if (tmpHoplogs[i].isDirectory()) {
            continue;
        }
        final Path p = tmpHoplogs[i].getPath();
        if (tmpHoplogs[i].getPath().getName().equals(currentHoplogName)) {
            if (logger.isDebugEnabled())
                logger.debug("Skipping current file: " + tmpHoplogs[i].getPath().getName(), logPrefix);
            continue;
        }
        SequenceFileHoplog hoplog = new SequenceFileHoplog(fs, p, stats);
        try {
            makeLegitimate(hoplog);
            logger.info(LocalizedMessage.create(LocalizedStrings.DEBUG, "Hoplog " + p
                    + " was a temporary hoplog because the node managing it wasn't shutdown properly last time. "
                    + "Fixed the hoplog name."));
        } catch (IOException e) {
            logger.info(LocalizedMessage.create(LocalizedStrings.DEBUG, "Hoplog " + p
                    + " is still a temporary hoplog because the node managing it wasn't shutdown properly last "
                    + "time. Failed to change the hoplog name because an exception was thrown while fixing it. "
                    + e));
        }
    }
}
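The long comment in the method above explains why the caller must pass in a FileSystem that is not shared with other open writers. One way to obtain an uncached instance is FileSystem.newInstance, which bypasses the cache that FileSystem.get consults. A sketch under that assumption (the organizer variable is hypothetical):

// FileSystem.get returns a cached instance shared across the JVM;
// FileSystem.newInstance always creates a fresh one, so open handles on the
// shared instance cannot trigger the HDFS-3848 lease-recovery exception above.
Configuration conf = new Configuration();
FileSystem freshFs = FileSystem.newInstance(bucketPath.toUri(), conf);
try {
    organizer.identifyAndFixTmpHoplogs(freshFs);
} finally {
    freshFs.close();
}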