List of usage examples for org.apache.hadoop.fs.PathFilter
PathFilter
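PathFilter is a single-method callback interface (boolean accept(Path path)) that FileSystem.listStatus and the FileInputFormat classes use to decide which paths to include in a listing. Every example below follows the same pattern: an anonymous PathFilter that inspects Path.getName(). As a minimal, self-contained sketch of that pattern (the directory argument here is hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListVisibleFiles {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path(args[0]); // e.g. a job output directory
        FileSystem fs = dir.getFileSystem(conf);
        // Skip Hadoop bookkeeping entries such as _SUCCESS and hidden dot-files.
        PathFilter visibleOnly = new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        };
        for (FileStatus status : fs.listStatus(dir, visibleOnly)) {
            System.out.println(status.getPath());
        }
    }
}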
From source file:com.cloudera.science.quince.SchemaUtils.java
License:Open Source License
public static Path findFile(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (fs.isDirectory(path)) {
        // Pick the first visible file, skipping bookkeeping entries such as
        // _SUCCESS and hidden dot-files.
        FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        });
        return fileStatuses[0].getPath();
    } else {
        return path;
    }
}
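A hypothetical call site for the method above, assuming the job output directory holds a single visible data file next to bookkeeping files like _SUCCESS (if there are several visible files, only the first listed one is returned):

// Hypothetical usage: resolve the single data file under a job output directory.
Configuration conf = new Configuration();
Path dataFile = SchemaUtils.findFile(new Path("/user/quince/output"), conf);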
From source file:com.cloudera.seismic.segy.SegyUnloader.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption("input", true, "SU sequence files to export from Hadoop");
    options.addOption("output", true, "The local SU file to write");

    // Parse the command line and check for required arguments.
    CommandLine cmdLine = new PosixParser().parse(options, args, false);
    if (!cmdLine.hasOption("input") || !cmdLine.hasOption("output")) {
        System.out.println("Missing required input/output arguments");
        new HelpFormatter().printHelp("SegyUnloader", options);
        System.exit(1);
    }

    Configuration conf = getConf();
    FileSystem hdfs = FileSystem.get(conf);
    Path inputPath = new Path(cmdLine.getOptionValue("input"));
    if (!hdfs.exists(inputPath)) {
        System.out.println("Input path does not exist");
        System.exit(1);
    }

    PathFilter pf = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };
    DataOutputStream os = new DataOutputStream(new FileOutputStream(cmdLine.getOptionValue("output")));
    for (FileStatus fs : hdfs.listStatus(inputPath, pf)) {
        write(fs.getPath(), os, conf);
    }
    os.close();
    return 0;
}
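SegyUnloader overrides run(String[]) and calls getConf(), which suggests the usual org.apache.hadoop.util.Tool contract. A hypothetical driver, assuming the class does implement Tool (the main method below is not part of the excerpt):

// Hypothetical main, assuming SegyUnloader implements org.apache.hadoop.util.Tool.
// Requires org.apache.hadoop.util.ToolRunner and org.apache.hadoop.conf.Configuration.
public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new Configuration(), new SegyUnloader(), args);
    System.exit(exitCode);
}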
From source file:com.dasasian.chok.operation.master.IndexDeployOperation.java
License:Apache License
protected static List<Shard> readShardsFromFs(final String indexName, final String indexPathString)
        throws IndexDeployException {
    // Get shard folders from the source.
    URI uri;
    try {
        uri = new URI(indexPathString);
    } catch (final URISyntaxException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "unable to parse index path uri '"
                + indexPathString + "', make sure it starts with file:// or hdfs://", e);
    }
    FileSystem fileSystem;
    try {
        fileSystem = HadoopUtil.getFileSystem(new Path(uri.toString()));
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "unable to retrieve file system for index path '" + indexPathString
                        + "', make sure your path starts with a Hadoop-supported prefix like file:// or hdfs://", e);
    }
    List<Shard> shards = new ArrayList<>();
    try {
        final Path indexPath = new Path(indexPathString);
        if (!fileSystem.exists(indexPath)) {
            throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                    "index path '" + uri + "' does not exist");
        }
        final FileStatus[] listStatus = fileSystem.listStatus(indexPath, new PathFilter() {
            public boolean accept(final Path aPath) {
                return !aPath.getName().startsWith(".");
            }
        });
        for (final FileStatus fileStatus : listStatus) {
            String shardPath = fileStatus.getPath().toString();
            if (fileStatus.isDir() || shardPath.endsWith(".zip")) {
                shards.add(new Shard(createShardName(indexName, shardPath), shardPath));
            }
        }
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "could not access index path: " + indexPathString, e);
    }
    if (shards.size() == 0) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "index does not contain any shard");
    }
    return shards;
}
From source file:com.datatorrent.lib.io.fs.FileStitcher.java
License:Apache License
protected void mergeBlocks(T stitchedFileMetaData) throws IOException {
    // When writing to tmp files there can be vagrant tmp files which we have to clean.
    final Path dst = new Path(filePath, stitchedFileMetaData.getStitchedFileRelativePath());
    PathFilter tempFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(dst.getName()) && path.getName().endsWith(PART_FILE_EXTENTION);
        }
    };
    if (outputFS.exists(dst.getParent())) {
        FileStatus[] statuses = outputFS.listStatus(dst.getParent(), tempFileFilter);
        for (FileStatus status : statuses) {
            String statusName = status.getPath().getName();
            LOG.debug("deleting vagrant file {}", statusName);
            outputFS.delete(status.getPath(), true);
        }
    }
    tempOutFilePath = new Path(filePath, stitchedFileMetaData.getStitchedFileRelativePath() + '.'
            + System.currentTimeMillis() + PART_FILE_EXTENTION);
    try {
        writeTempOutputFile(stitchedFileMetaData);
        moveToFinalFile(stitchedFileMetaData);
    } catch (BlockNotFoundException e) {
        LOG.warn("Block file {} not found. Assuming recovery mode for file {}. ", e.getBlockPath(),
                stitchedFileMetaData.getStitchedFileRelativePath());
        // Remove the temp output file.
        outputFS.delete(tempOutFilePath, false);
    }
}
From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java
License:Apache License
/**
 * Read the feature frequency list which is built at the end of the Word Count job and assign ids to the
 * features. This will use constant memory and will run at the speed of your disk read.
 */
private static List<Path> createDictionaryChunks(Path dictPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);
    FileSystem fs = FileSystem.get(dictPath.toUri(), conf);
    // Collect the dictionary files, ignoring checksum side files.
    FileStatus[] dictFiles = fs.listStatus(dictPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return name.startsWith("dictionary.") && !name.endsWith(".crc");
        }
    });
    for (int i = 0; i < dictFiles.length; i++) {
        chunkPaths.add(dictFiles[i].getPath());
    }
    return chunkPaths;
}
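A hypothetical consumer of the returned chunk list, assuming the usual Mahout dictionary layout in which each dictionary file is a SequenceFile of (Text term, IntWritable id); fs and conf are as in the method above, and the SequenceFile, Text, and IntWritable classes come from org.apache.hadoop.io:

// Hypothetical: print every term/id pair from the dictionary chunks.
for (Path chunkPath : chunkPaths) {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, chunkPath, conf);
    try {
        Text term = new Text();
        IntWritable id = new IntWritable();
        while (reader.next(term, id)) {
            System.out.println(term + " -> " + id.get());
        }
    } finally {
        reader.close();
    }
}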
From source file:com.ery.dimport.daemon.TaskManager.java
License:Apache License
public void runTask(final TaskInfo task) {
    // Note: several log/exception messages in this method were mis-encoded in the
    // original source; the English text here is reconstructed from context.
    List<LogHostRunInfoPO> allFiles = new ArrayList<LogHostRunInfoPO>();
    try {
        task.START_TIME = new Date(System.currentTimeMillis());
        boolean needUpdate = false;
        TaskInfo exists = allTask.get(task.TASK_ID);
        if (exists == null) {
            needUpdate = true;
        } else {
            task.hosts = exists.hosts;
        }
        if (task.hosts == null || task.hosts.size() == 0) {
            task.hosts = new ArrayList<String>(master.getServerManager().getOnlineServers().keySet());
            needUpdate = true;
        }
        if (ZKUtil.checkExists(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID) == -1) {
            needUpdate = true;
        }
        if (needUpdate) {
            try {
                task.HOST_SIZE = task.hosts.size();
                master.logWriter.writeLog(task);
                ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID,
                        DImportConstant.Serialize(task));
            } catch (Throwable e) {
            }
        }
        Thread thread = Thread.currentThread();
        ProcessInfo procInfo = null;
        synchronized (taskInProgress) {
            procInfo = taskInProgress.get(task.getRunTaskId());
        }
        procInfo.thread = thread;
        procInfo.startTime = System.currentTimeMillis();
        String filePath = task.FILE_PATH;
        boolean isInHdfs = false;
        final Map<String, Long> files = new HashMap<String, Long>();
        String tmpPath = conf.get(DImportConstant.DIMPORT_PROCESS_TMPDATA_DIR, System.getProperty("user.home"));
        if (tmpPath.endsWith("/")) {
            tmpPath = tmpPath.substring(0, tmpPath.length() - 1);
        }
        if (filePath == null || filePath.equals("")) {
            files.put("", 0L);
        } else {
            if (task.fileNamePattern != null || (task.FILE_FILTER != null && !task.FILE_FILTER.equals(""))) {
                task.FILE_FILTER = DImportConstant.macroProcess(task.FILE_FILTER);
                task.FILE_FILTER = task.FILE_FILTER.replaceAll("\\{host\\}", this.master.hostName);
                task.fileNamePattern = Pattern.compile(task.FILE_FILTER);
            }
            Matcher m = hdfsUrlPattern.matcher(filePath);
            if (m.matches()) {
                isInHdfs = true;
                filePath = m.group(2);
                // for (String string : conf.getValByRegex(".*").keySet()) {
                //     System.out.println(string + "=" + conf.get(string));
                // }
                Path dirPath = new Path(filePath);
                FileSystem fs = FileSystem.get(HadoopConf.getConf(conf));
                if (!fs.exists(dirPath) || !fs.isDirectory(dirPath)) {
                    throw new IOException("HDFS path " + filePath + " does not exist or is not a directory");
                }
                FileStatus[] hFiles = fs.listStatus(dirPath, new PathFilter() {
                    @Override
                    public boolean accept(Path name) {
                        if (task.fileNamePattern != null) {
                            System.out.println("hdfs listStatus:" + name.getParent() + "/" + name.getName());
                            return task.fileNamePattern.matcher(name.getName()).matches();
                        } else {
                            return true;
                        }
                    }
                });
                for (int i = 0; i < hFiles.length; i++) {
                    files.put(hFiles[i].getPath().toString(), hFiles[i].getLen());
                }
            } else {
                java.io.File f = new File(filePath);
                if (!f.exists() || !f.isDirectory()) {
                    throw new IOException("local path " + filePath + " does not exist or is not a directory");
                }
                File[] lFiles = f.listFiles(new FilenameFilter() {
                    public boolean accept(File dir, String name) {
                        if (task.fileNamePattern != null) {
                            System.out.println("local fs listStatus:" + dir + "/" + name);
                            return task.fileNamePattern.matcher(name).matches();
                        } else {
                            return true;
                        }
                    }
                });
                for (int i = 0; i < lFiles.length; i++) {
                    files.put(lFiles[i].getAbsolutePath(), lFiles[i].length());
                }
            }
        }
        for (String fileName : files.keySet()) {
            LogHostRunInfoPO runInfo = new LogHostRunInfoPO(task);
            runInfo.RUN_LOG_ID = DImportConstant.shdf.format(task.SUBMIT_TIME) + "_" + allFiles.size() + "_"
                    + fileName.hashCode();
            runInfo.FILE_NAME = fileName;
            runInfo.RETURN_CODE = 255;
            runInfo.IS_RUN_SUCCESS = -1;
            runInfo.FILE_SIZE = files.get(fileName);
            runInfo.HOST_NAME = master.hostName;
            String localFile = fileName;
            if (isInHdfs) { // map the HDFS file to a local temp path
                localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1);
            }
            String[] cmds = procInfo.task.getCommand();
            for (int j = 0; j < cmds.length; j++) {
                cmds[j] = DImportConstant.macroProcess(cmds[j]);
                cmds[j] = cmds[j].replaceAll("\\{file\\}", localFile);
                cmds[j] = cmds[j].replaceAll("\\{host\\}", master.hostName);
            }
            runInfo.RUN_COMMAND = StringUtils.join(" ", cmds);
            master.logWriter.writeLog(runInfo);
            LOG.info("queued run info: " + runInfo);
            allFiles.add(runInfo);
        }
        ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                DImportConstant.Serialize(allFiles));
        for (LogHostRunInfoPO runInfo : allFiles) {
            if (procInfo.stoped)
                break;
            String fileName = runInfo.FILE_NAME;
            LOG.info("processing file: " + fileName);
            procInfo.RUN_LOG_ID = runInfo.RUN_LOG_ID;
            runInfo.START_TIME = new Date(System.currentTimeMillis());
            procInfo.processFile = fileName;
            String localFile = fileName;
            try {
                if (isInHdfs) {
                    localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1);
                }
                procInfo.task.TASK_COMMAND = runInfo.RUN_COMMAND;
                if (isInHdfs) {
                    File lf = new File(localFile);
                    if (lf.exists())
                        lf.delete();
                    FileSystem fs = FileSystem.get(HadoopConf.getConf(conf));
                    LOG.info("copying HDFS file: " + fileName + " ===> " + localFile);
                    long btime = System.currentTimeMillis();
                    fs.copyToLocalFile(new Path(fileName), new Path(localFile));
                    LOG.info("copied HDFS file: " + fileName + " ===> " + localFile);
                    runInfo.downTime = System.currentTimeMillis() - btime;
                    fileName = localFile;
                }
                updateHostInfoLog(runInfo, allFiles);
                LOG.info(procInfo.task.TASK_NAME + " commandline: " + procInfo.task.TASK_COMMAND);
                procInfo.proc = execResult(runInfo.RUN_COMMAND);
                runInfo.IS_RUN_SUCCESS = 1;
                runInfo.RETURN_CODE = writeProcessLog(procInfo);
                LOG.info(procInfo.task.TASK_NAME + " return value: " + runInfo.RETURN_CODE);
                // runInfo.RETURN_CODE = procInfo.proc.exitValue();
            } catch (Throwable e) {
                runInfo.ERROR_MSG = e.getMessage();
                if (procInfo.proc != null) {
                    try {
                        procInfo.proc.destroy();
                    } catch (Exception ex) {
                    }
                }
                procInfo.proc = null;
                LOG.error("file processing failed", e);
            } finally {
                runInfo.END_TIME = new Date(System.currentTimeMillis());
                master.logWriter.updateLog(runInfo);
                updateHostInfoLog(runInfo, allFiles);
                ZKUtil.createSetData(watcher,
                        watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                        DImportConstant.Serialize(allFiles));
                if (isInHdfs) {
                    File lf = new File(localFile);
                    if (lf.exists())
                        lf.delete();
                }
            }
        }
    } catch (Throwable e) {
        LOG.error("task run failed: " + task, e);
        try {
            if (allFiles.size() > 0) {
                for (LogHostRunInfoPO logHostRunInfoPO : allFiles) {
                    if (logHostRunInfoPO.END_TIME.getTime() < 10000) {
                        logHostRunInfoPO.END_TIME = new Date(System.currentTimeMillis());
                        logHostRunInfoPO.IS_RUN_SUCCESS = 1;
                        logHostRunInfoPO.RETURN_CODE = 2;
                    }
                }
                ZKUtil.createSetData(watcher,
                        watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                        DImportConstant.Serialize(allFiles));
            }
        } catch (KeeperException e1) {
            LOG.error("update task run info on host :" + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/"
                    + master.hostName, e);
        } catch (IOException e1) {
            LOG.error("update task run info on host " + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/"
                    + master.hostName, e);
        }
    } finally {
        synchronized (taskInProgress) {
            taskInProgress.remove(task.getRunTaskId());
        }
    }
}
From source file:com.flipkart.fdp.migration.distcp.core.MirrorDCMImpl.java
License:Apache License
public static void HackMapreduce() throws Exception {
    // Replace the hidden-file filters of both the mapreduce and mapred
    // FileInputFormat classes so that files starting with "_" or "." are
    // no longer skipped.
    DCMConstants.setFinalStatic(
            org.apache.hadoop.mapreduce.lib.input.FileInputFormat.class.getDeclaredField("hiddenFileFilter"),
            new PathFilter() {
                public boolean accept(Path p) {
                    return true;
                }
            });
    DCMConstants.setFinalStatic(
            org.apache.hadoop.mapred.FileInputFormat.class.getDeclaredField("hiddenFileFilter"),
            new PathFilter() {
                public boolean accept(Path p) {
                    return true;
                }
            });
}
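DCMConstants.setFinalStatic is a project helper whose implementation is not shown in this excerpt. A typical sketch of such a helper (an assumption, not the project's actual code; the Field.modifiers trick only works on older JDKs, roughly 11 and below):

import java.lang.reflect.Field;
import java.lang.reflect.Modifier;

static void setFinalStatic(Field field, Object newValue) throws Exception {
    field.setAccessible(true);
    // Clear the final modifier so the static field can be reassigned.
    Field modifiersField = Field.class.getDeclaredField("modifiers");
    modifiersField.setAccessible(true);
    modifiersField.setInt(field, field.getModifiers() & ~Modifier.FINAL);
    field.set(null, newValue);
}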
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HdfsSortedOplogOrganizer.java
License:Apache License
protected FileStatus[] getExpiryMarkers() throws IOException {
    FileSystem fs = store.getFileSystem();
    if (hoplogReadersController.hoplogs == null || hoplogReadersController.hoplogs.size() == 0) {
        // There are no hoplogs in the system. Maybe the bucket does not exist at all.
        if (!fs.exists(bucketPath)) {
            if (logger.isDebugEnabled())
                logger.debug("{}This bucket is unused, skipping expired hoplog check", logPrefix);
            return null;
        }
    }
    FileStatus files[] = FSUtils.listStatus(fs, bucketPath, new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All expired hoplogs end with the expire extension and must match the valid file regex.
            String fileName = file.getName();
            if (!fileName.endsWith(EXPIRED_HOPLOG_EXTENSION)) {
                return false;
            }
            fileName = truncateExpiryExtension(fileName);
            Matcher matcher = SORTED_HOPLOG_PATTERN.matcher(fileName);
            return matcher.find();
        }
    });
    return files;
}
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSUnsortedHoplogOrganizer.java
License:Apache License
public HDFSUnsortedHoplogOrganizer(HdfsRegionManager region, int bucketId) throws IOException {
    super(region, bucketId);
    writer = null;
    sequence = new AtomicInteger(0);
    fileSystem = store.getFileSystem();
    if (!fileSystem.exists(bucketPath)) {
        return;
    }
    FileStatus validHoplogs[] = FSUtils.listStatus(fileSystem, bucketPath, new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All valid hoplog files must match the regex.
            Matcher matcher = HOPLOG_PATTERN.matcher(file.getName());
            return matcher.matches();
        }
    });
    if (validHoplogs != null && validHoplogs.length > 0) {
        for (FileStatus file : validHoplogs) {
            // Account for the disk used by this file.
            incrementDiskUsage(file.getLen());
        }
    }
}
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSUnsortedHoplogOrganizer.java
License:Apache License
/**
 * Fixes the size of hoplogs that were not closed properly last time. Such hoplogs are *.tmphop files.
 * Identify them, open them and close them; this fixes the size. After doing this, rename them to *.hop.
 *
 * @throws IOException
 * @throws ForceReattemptException
 */
void identifyAndFixTmpHoplogs(FileSystem fs) throws IOException, ForceReattemptException {
    if (logger.isDebugEnabled())
        logger.debug("{}Fixing temporary hoplogs", logPrefix);

    // A different filesystem is passed to this function for the following reason:
    // For HDFS, if a file wasn't closed properly last time, then while calling
    // FileSystem.append for this file, FSNamesystem.startFileInternal ->
    // FSNamesystem.recoverLeaseInternal gets called. This function throws
    // AlreadyBeingCreatedException if there is an open handle to any other file
    // created using the same FileSystem object. This is a bug and is being tracked at:
    // https://issues.apache.org/jira/browse/HDFS-3848?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
    //
    // The fix for this bug is not yet part of Pivotal HD. So to overcome the bug,
    // we create a new file system for the timer task so that it does not encounter the bug.
    FileStatus tmpHoplogs[] = FSUtils.listStatus(fs, fs.makeQualified(bucketPath), new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All valid hoplog files must match the regex.
            Matcher matcher = patternForTmpHoplog.matcher(file.getName());
            return matcher.matches();
        }
    });

    if (tmpHoplogs == null || tmpHoplogs.length == 0) {
        if (logger.isDebugEnabled())
            logger.debug("{}No files to fix", logPrefix);
        return;
    }

    // Ping secondaries so that in case of split brain, no other vm has taken up as primary. #50110.
    pingSecondaries();

    if (logger.isDebugEnabled())
        logger.debug("{}Files to fix " + tmpHoplogs.length, logPrefix);

    // Get the current hoplog name; we need to ignore the current hoplog while fixing.
    String currentHoplogName = null;
    if (currentHoplog != null) {
        currentHoplogName = currentHoplog.getFileName();
    }

    for (int i = 0; i < tmpHoplogs.length; i++) {
        // Skip directories.
        if (tmpHoplogs[i].isDirectory()) {
            continue;
        }
        final Path p = tmpHoplogs[i].getPath();
        if (tmpHoplogs[i].getPath().getName().equals(currentHoplogName)) {
            if (logger.isDebugEnabled())
                logger.debug("Skipping current file: " + tmpHoplogs[i].getPath().getName(), logPrefix);
            continue;
        }
        SequenceFileHoplog hoplog = new SequenceFileHoplog(fs, p, stats);
        try {
            makeLegitimate(hoplog);
            logger.info(LocalizedMessage.create(LocalizedStrings.DEBUG, "Hoplog " + p
                    + " was a temporary hoplog because the node managing it wasn't shutdown properly last time. "
                    + "Fixed the hoplog name."));
        } catch (IOException e) {
            logger.info(LocalizedMessage.create(LocalizedStrings.DEBUG, "Hoplog " + p
                    + " is still a temporary hoplog because the node managing it wasn't shutdown properly last "
                    + "time. Failed to change the hoplog name because an exception was thrown while fixing it. "
                    + e));
        }
    }
}
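The long comment in the method above explains why the caller must pass in a FileSystem that is not shared with other open writers. One way to obtain an uncached instance is FileSystem.newInstance, which bypasses the cache that FileSystem.get consults. A sketch under that assumption (the organizer variable is hypothetical):

// FileSystem.get returns a cached instance shared across the JVM;
// FileSystem.newInstance always creates a fresh one, so open handles on the
// shared instance cannot trigger the HDFS-3848 lease-recovery exception above.
Configuration conf = new Configuration();
FileSystem freshFs = FileSystem.newInstance(bucketPath.toUri(), conf);
try {
    organizer.identifyAndFixTmpHoplogs(freshFs);
} finally {
    freshFs.close();
}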