Example usage for org.apache.hadoop.fs PathFilter PathFilter

List of usage examples for org.apache.hadoop.fs PathFilter PathFilter

Introduction

On this page you can find example usages of org.apache.hadoop.fs.PathFilter.

Prototype

public interface PathFilter {
    boolean accept(Path path);
}
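
A typical use is to hand an implementation to FileSystem.listStatus so that only matching entries come back. The following minimal, self-contained sketch (class name and arguments are illustrative, not taken from the projects below) lists the visible children of a directory, skipping hidden and bookkeeping entries:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListVisibleFiles {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path dir = new Path(args[0]);
        FileSystem fs = dir.getFileSystem(conf);

        // Accept only entries that are not hidden or job bookkeeping files.
        FileStatus[] statuses = fs.listStatus(dir, new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        });

        for (FileStatus status : statuses) {
            System.out.println(status.getPath());
        }
    }
}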

Usage

From source file:com.cloudera.science.quince.SchemaUtils.java

License:Open Source License

public static Path findFile(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (fs.isDirectory(path)) {
        FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
            // Skip hidden entries (".*") and job bookkeeping files ("_*", e.g. _SUCCESS).
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        });
        return fileStatuses[0].getPath();
    } else {
        return path;
    }
}
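
Because PathFilter declares a single abstract method, the anonymous class above can also be written as a lambda on Java 8 and later; the same filter in a single expression (same fs and path as above):

FileStatus[] fileStatuses = fs.listStatus(path,
        p -> !p.getName().startsWith("_") && !p.getName().startsWith("."));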

From source file:com.cloudera.seismic.segy.SegyUnloader.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption("input", true, "SU sequence files to export from Hadoop");
    options.addOption("output", true, "The local SU file to write");

    // Parse the commandline and check for required arguments.
    CommandLine cmdLine = new PosixParser().parse(options, args, false);
    if (!cmdLine.hasOption("input") || !cmdLine.hasOption("output")) {
        System.out.println("Mising required input/output arguments");
        new HelpFormatter().printHelp("SegyUnloader", options);
        System.exit(1);
    }

    Configuration conf = getConf();
    FileSystem hdfs = FileSystem.get(conf);
    Path inputPath = new Path(cmdLine.getOptionValue("input"));
    if (!hdfs.exists(inputPath)) {
        System.out.println("Input path does not exist");
        System.exit(1);
    }

    // Skip job bookkeeping outputs such as _SUCCESS and _logs.
    PathFilter pf = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };

    DataOutputStream os = new DataOutputStream(new FileOutputStream(cmdLine.getOptionValue("output")));
    for (FileStatus fs : hdfs.listStatus(inputPath, pf)) {
        write(fs.getPath(), os, conf);
    }
    os.close();

    return 0;
}
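
As a side note (a sketch of an alternative, not taken from the project above), the output stream could be wrapped in try-with-resources so it is closed even when a write fails; write is the SegyUnloader helper used above:

try (DataOutputStream os = new DataOutputStream(
        new FileOutputStream(cmdLine.getOptionValue("output")))) {
    for (FileStatus fs : hdfs.listStatus(inputPath, pf)) {
        write(fs.getPath(), os, conf);
    }
}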

From source file:com.dasasian.chok.operation.master.IndexDeployOperation.java

License:Apache License

protected static List<Shard> readShardsFromFs(final String indexName, final String indexPathString)
        throws IndexDeployException {
    // get shard folders from source
    URI uri;
    try {
        uri = new URI(indexPathString);
    } catch (final URISyntaxException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "unable to parse index path uri '"
                + indexPathString + "', make sure it starts with file:// or hdfs:// ", e);
    }
    FileSystem fileSystem;
    try {
        fileSystem = HadoopUtil.getFileSystem(new Path(uri.toString()));
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "unable to retrive file system for index path '" + indexPathString
                        + "', make sure your path starts with hadoop support prefix like file:// or hdfs://",
                e);
    }

    List<Shard> shards = new ArrayList<>();
    try {
        final Path indexPath = new Path(indexPathString);
        if (!fileSystem.exists(indexPath)) {
            throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                    "index path '" + uri + "' does not exists");
        }
        final FileStatus[] listStatus = fileSystem.listStatus(indexPath, new PathFilter() {
            public boolean accept(final Path aPath) {
                return !aPath.getName().startsWith(".");
            }
        });
        for (final FileStatus fileStatus : listStatus) {
            String shardPath = fileStatus.getPath().toString();
            if (fileStatus.isDir() || shardPath.endsWith(".zip")) {
                shards.add(new Shard(createShardName(indexName, shardPath), shardPath));
            }
        }
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "could not access index path: " + indexPathString, e);
    }

    if (shards.size() == 0) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "index does not contain any shard");
    }
    return shards;
}

From source file:com.datatorrent.lib.io.fs.FileStitcher.java

License:Apache License

protected void mergeBlocks(T stitchedFileMetaData) throws IOException {
    // when writing to tmp files there can be stray tmp files which we have to clean up
    final Path dst = new Path(filePath, stitchedFileMetaData.getStitchedFileRelativePath());
    PathFilter tempFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(dst.getName()) && path.getName().endsWith(PART_FILE_EXTENTION);
        }
    };
    if (outputFS.exists(dst.getParent())) {
        FileStatus[] statuses = outputFS.listStatus(dst.getParent(), tempFileFilter);
        for (FileStatus status : statuses) {
            String statusName = status.getPath().getName();
            LOG.debug("deleting vagrant file {}", statusName);
            outputFS.delete(status.getPath(), true);
        }
    }
    tempOutFilePath = new Path(filePath, stitchedFileMetaData.getStitchedFileRelativePath() + '.'
            + System.currentTimeMillis() + PART_FILE_EXTENTION);
    try {
        writeTempOutputFile(stitchedFileMetaData);
        moveToFinalFile(stitchedFileMetaData);
    } catch (BlockNotFoundException e) {
        LOG.warn("Block file {} not found. Assuming recovery mode for file {}. ", e.getBlockPath(),
                stitchedFileMetaData.getStitchedFileRelativePath());
        //Remove temp output file
        outputFS.delete(tempOutFilePath, false);
    }
}

From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java

License:Apache License

/**
 * Read the feature frequency List which is built at the end of the Word Count Job and assign ids to them.
 * This will use constant memory and will run at the speed of your disk read
 */
private static List<Path> createDictionaryChunks(Path dictPath, Path dictionaryPathBase, Configuration baseConf,
        int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();

    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(dictPath.toUri(), conf);
    FileStatus[] dictFiles = fs.listStatus(dictPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return name.startsWith("dictionary.") && !name.endsWith(".crc");
        }
    });
    for (int i = 0; i < dictFiles.length; i++) {
        chunkPaths.add(dictFiles[i].getPath());
    }

    return chunkPaths;
}
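
The same selection could arguably be expressed with FileSystem.globStatus, which takes a glob pattern plus an optional PathFilter; a brief sketch under that assumption:

FileStatus[] dictFiles = fs.globStatus(new Path(dictPath, "dictionary.*"),
        path -> !path.getName().endsWith(".crc"));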

From source file:com.ery.dimport.daemon.TaskManager.java

License:Apache License

public void runTask(final TaskInfo task) {
    List<LogHostRunInfoPO> allFiles = new ArrayList<LogHostRunInfoPO>();
    try {
        task.START_TIME = new Date(System.currentTimeMillis());
        boolean needUpdate = false;
        TaskInfo exists = allTask.get(task.TASK_ID);
        if (exists == null) {
            needUpdate = true;
        } else {
            task.hosts = exists.hosts;
        }
        if (task.hosts == null || task.hosts.size() == 0) {
            task.hosts = new ArrayList<String>(master.getServerManager().getOnlineServers().keySet());
            needUpdate = true;
        }
        if (ZKUtil.checkExists(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID) == -1) {
            needUpdate = true;
        }
        if (needUpdate) {
            try {
                task.HOST_SIZE = task.hosts.size();
                master.logWriter.writeLog(task);
                ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID,
                        DImportConstant.Serialize(task));
            } catch (Throwable e) {
            }
        }
        Thread thread = Thread.currentThread();
        ProcessInfo procInfo = null;
        synchronized (taskInProgress) {
            procInfo = taskInProgress.get(task.getRunTaskId());
        }
        procInfo.thread = thread;
        procInfo.startTime = System.currentTimeMillis();
        String filePath = task.FILE_PATH;
        boolean isInHdfs = false;
        final Map<String, Long> files = new HashMap<String, Long>();
        String tmpPath = conf.get(DImportConstant.DIMPORT_PROCESS_TMPDATA_DIR, System.getProperty("user.home"));
        if (tmpPath.endsWith("/")) {
            tmpPath = tmpPath.substring(0, tmpPath.length() - 1);
        }
        if (filePath == null || filePath.equals("")) {
            files.put("", 0l);
        } else {
            if (task.fileNamePattern != null || (task.FILE_FILTER != null && !task.FILE_FILTER.equals(""))) {
                task.FILE_FILTER = DImportConstant.macroProcess(task.FILE_FILTER);
                task.FILE_FILTER = task.FILE_FILTER.replaceAll("\\{host\\}", this.master.hostName);
                task.fileNamePattern = Pattern.compile(task.FILE_FILTER);
            }
            Matcher m = hdfsUrlPattern.matcher(filePath);
            if (m.matches()) {
                isInHdfs = true;
                filePath = m.group(2);
                // for (String string : conf.getValByRegex(".*").keySet()) {
                // System.out.println(string + "=" + conf.get(string));
                // }
                Path dirPath = new Path(filePath);
                FileSystem fs = FileSystem.get(HadoopConf.getConf(conf));
                if (!fs.exists(dirPath) || !fs.isDirectory(dirPath)) {
                    throw new IOException("HDFS? " + filePath + "?,?");
                }
                FileStatus[] hFiles = fs.listStatus(dirPath, new PathFilter() {
                    @Override
                    public boolean accept(Path name) {
                        if (task.fileNamePattern != null) {
                            System.out.println("hdfs listStatus:" + name.getParent() + "/" + name.getName());
                            return task.fileNamePattern.matcher(name.getName()).matches();
                        } else {
                            return true;
                        }
                    }
                });
                for (int i = 0; i < hFiles.length; i++) {
                    files.put(hFiles[i].getPath().toString(), hFiles[i].getLen());
                }
            } else {
                java.io.File f = new File(filePath);
                if (!f.exists() || !f.isDirectory()) {
                    throw new IOException(
                            "local path " + filePath + " does not exist or is not a directory");
                }
                File[] lFiles = f.listFiles(new FilenameFilter() {
                    public boolean accept(File dir, String name) {
                        if (task.fileNamePattern != null) {
                            System.out.println("local fs listStatus:" + dir + "/" + name);
                            return task.fileNamePattern.matcher(name).matches();
                        } else {
                            return true;
                        }
                    }
                });
                for (int i = 0; i < lFiles.length; i++) {
                    files.put(lFiles[i].getAbsolutePath(), lFiles[i].length());
                }
            }
        }
        for (String fileName : files.keySet()) {
            LogHostRunInfoPO runInfo = new LogHostRunInfoPO(task);
            runInfo.RUN_LOG_ID = DImportConstant.shdf.format(task.SUBMIT_TIME) + "_" + allFiles.size() + "_"
                    + fileName.hashCode();
            runInfo.FILE_NAME = fileName;
            runInfo.RETURN_CODE = 255;
            runInfo.IS_RUN_SUCCESS = -1;
            runInfo.FILE_SIZE = files.get(fileName);
            runInfo.HOST_NAME = master.hostName;
            String localFile = fileName;
            if (isInHdfs) { // HDFS files are staged under the local tmp directory
                localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1);
            }
            // build the command line, expanding the {file} and {host} macros
            String[] cmds = procInfo.task.getCommand();
            for (int j = 0; j < cmds.length; j++) {
                cmds[j] = DImportConstant.macroProcess(cmds[j]);
                cmds[j] = cmds[j].replaceAll("\\{file\\}", localFile);
                cmds[j] = cmds[j].replaceAll("\\{host\\}", master.hostName);
            }
            runInfo.RUN_COMMAND = StringUtils.join(" ", cmds);
            master.logWriter.writeLog(runInfo);
            LOG.info("??" + runInfo);
            allFiles.add(runInfo);
        }
        ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                DImportConstant.Serialize(allFiles));
        for (LogHostRunInfoPO runInfo : allFiles) {
            if (procInfo.stoped)
                break;
            String fileName = runInfo.FILE_NAME;
            LOG.info("?:" + fileName);
            procInfo.RUN_LOG_ID = runInfo.RUN_LOG_ID;
            runInfo.START_TIME = new Date(System.currentTimeMillis());
            procInfo.processFile = fileName;
            String localFile = fileName;
            try {
                if (isInHdfs) { // derive the local staging path for the HDFS file
                    localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1);
                }
                procInfo.task.TASK_COMMAND = runInfo.RUN_COMMAND;
                if (isInHdfs) { // download the HDFS file to the local staging path
                    File lf = new File(localFile);
                    if (lf.exists())
                        lf.delete();
                    FileSystem fs = FileSystem.get(HadoopConf.getConf(conf));
                    LOG.info("HDFS:" + fileName + "===>" + localFile);
                    long btime = System.currentTimeMillis();
                    fs.copyToLocalFile(new Path(fileName), new Path(localFile));
                    LOG.info("HDFS?:" + fileName + "===>" + localFile);
                    runInfo.downTime = System.currentTimeMillis() - btime;
                    fileName = localFile;
                }
                updateHostInfoLog(runInfo, allFiles);
                LOG.info(procInfo.task.TASK_NAME + " commandline: " + procInfo.task.TASK_COMMAND);
                procInfo.proc = execResult(runInfo.RUN_COMMAND);
                runInfo.IS_RUN_SUCCESS = 1;
                runInfo.RETURN_CODE = writeProcessLog(procInfo);
                LOG.info(procInfo.task.TASK_NAME + " return value: " + runInfo.RETURN_CODE);
                // runInfo.RETURN_CODE = procInfo.proc.exitValue();
            } catch (Throwable e) {
                runInfo.ERROR_MSG = e.getMessage();
                if (procInfo.proc != null) {
                    try {
                        procInfo.proc.destroy();
                    } catch (Exception ex) {
                    }
                }
                procInfo.proc = null;
                LOG.error("", e);
            } finally { // record completion and clean up the local staging copy
                runInfo.END_TIME = new Date(System.currentTimeMillis());
                master.logWriter.updateLog(runInfo);
                updateHostInfoLog(runInfo, allFiles);
                ZKUtil.createSetData(watcher,
                        watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                        DImportConstant.Serialize(allFiles));
                if (isInHdfs) {
                    File lf = new File(localFile);
                    if (lf.exists())
                        lf.delete();
                }
            }
        }
    } catch (Throwable e) {
        LOG.error("" + task, e);
        try {
            if (allFiles.size() > 0) {
                for (LogHostRunInfoPO logHostRunInfoPO : allFiles) {
                    if (logHostRunInfoPO.END_TIME.getTime() < 10000) {
                        logHostRunInfoPO.END_TIME = new Date(System.currentTimeMillis());
                        logHostRunInfoPO.IS_RUN_SUCCESS = 1;
                        logHostRunInfoPO.RETURN_CODE = 2;
                    }
                }
                ZKUtil.createSetData(watcher,
                        watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                        DImportConstant.Serialize(allFiles));
            }
        } catch (KeeperException e1) {
            LOG.error("update task run info on host :" + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/"
                    + master.hostName, e);
        } catch (IOException e1) {
            LOG.error("update task run info on host " + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/"
                    + master.hostName, e);
        }
    } finally { // remove this task from the in-progress map
        synchronized (taskInProgress) {
            taskInProgress.remove(task.getRunTaskId());
        }
    }
}

From source file:com.flipkart.fdp.migration.distcp.core.MirrorDCMImpl.java

License:Apache License

public static void HackMapreduce() throws Exception {

    DCMConstants.setFinalStatic(
            org.apache.hadoop.mapreduce.lib.input.FileInputFormat.class.getDeclaredField("hiddenFileFilter"),
            new PathFilter() {
                public boolean accept(Path p) {
                    return true;
                }
            });
    DCMConstants.setFinalStatic(
            org.apache.hadoop.mapred.FileInputFormat.class.getDeclaredField("hiddenFileFilter"),
            new PathFilter() {
                public boolean accept(Path p) {
                    return true;
                }
            });
}
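
For context: the hiddenFileFilter field replaced above is FileInputFormat's built-in filter, which normally excludes paths whose names start with "_" or ".", and the reflection hack swaps it for an accept-everything filter. The default behaviour is roughly equivalent to this sketch (an approximation, not the exact Hadoop source):

PathFilter acceptVisibleOnly = new PathFilter() {
    public boolean accept(Path p) {
        String name = p.getName();
        return !name.startsWith("_") && !name.startsWith(".");
    }
};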

From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HdfsSortedOplogOrganizer.java

License:Apache License

protected FileStatus[] getExpiryMarkers() throws IOException {
    FileSystem fs = store.getFileSystem();
    if (hoplogReadersController.hoplogs == null || hoplogReadersController.hoplogs.size() == 0) {
        // there are no hoplogs in the system; maybe the bucket does not exist
        // at all.
        if (!fs.exists(bucketPath)) {
            if (logger.isDebugEnabled())
                logger.debug("{}This bucket is unused, skipping expired hoplog check", logPrefix);
            return null;
        }
    }

    FileStatus files[] = FSUtils.listStatus(fs, bucketPath, new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All expired hoplog end with expire extension and must match the valid file regex
            String fileName = file.getName();
            if (!fileName.endsWith(EXPIRED_HOPLOG_EXTENSION)) {
                return false;
            }
            fileName = truncateExpiryExtension(fileName);
            Matcher matcher = SORTED_HOPLOG_PATTERN.matcher(fileName);
            return matcher.find();
        }

    });
    return files;
}

From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSUnsortedHoplogOrganizer.java

License:Apache License

public HDFSUnsortedHoplogOrganizer(HdfsRegionManager region, int bucketId) throws IOException {
    super(region, bucketId);
    writer = null;
    sequence = new AtomicInteger(0);

    fileSystem = store.getFileSystem();
    if (!fileSystem.exists(bucketPath)) {
        return;
    }

    FileStatus validHoplogs[] = FSUtils.listStatus(fileSystem, bucketPath, new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All valid hoplog files must match the regex
            Matcher matcher = HOPLOG_PATTERN.matcher(file.getName());
            return matcher.matches();
        }
    });

    if (validHoplogs != null && validHoplogs.length > 0) {
        for (FileStatus file : validHoplogs) {
            // account for the disk used by this file
            incrementDiskUsage(file.getLen());
        }
    }

}

From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSUnsortedHoplogOrganizer.java

License:Apache License

/**
 * Fixes the size of hoplogs that were not closed properly last time. 
 * Such hoplogs are *.tmphop files. Identify them, open them, and close
 * them; this fixes the size. After doing this, rename them to *.hop.
 *
 * @throws IOException
 * @throws ForceReattemptException 
 */
void identifyAndFixTmpHoplogs(FileSystem fs) throws IOException, ForceReattemptException {
    if (logger.isDebugEnabled())
        logger.debug("{}Fixing temporary hoplogs", logPrefix);

    // A different filesystem is passed to this function for the following reason: 
    // For HDFS, if a file wasn't closed properly last time, 
    // while calling FileSystem.append for this file, FSNamesystem.startFileInternal->
    // FSNamesystem.recoverLeaseInternal function gets called. 
    // This function throws AlreadyBeingCreatedException if there is an open handle, to any other file, 
    // created using the same FileSystem object. This is a bug and is being tracked at: 
    // https://issues.apache.org/jira/browse/HDFS-3848?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
    // 
    // The fix for this bug is not yet part of Pivotal HD. So to overcome the bug, 
    // we create a new file system for the timer task so that it does not encounter the bug. 

    FileStatus tmpHoplogs[] = FSUtils.listStatus(fs, fs.makeQualified(bucketPath), new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All valid hoplog files must match the regex
            Matcher matcher = patternForTmpHoplog.matcher(file.getName());
            return matcher.matches();
        }
    });

    if (tmpHoplogs == null || tmpHoplogs.length == 0) {
        if (logger.isDebugEnabled())
            logger.debug("{}No files to fix", logPrefix);
        return;
    }
    // ping secondaries so that in case of split brain, no other vm has taken up 
    // as primary. #50110. 
    pingSecondaries();
    if (logger.isDebugEnabled())
        logger.debug("{}Files to fix " + tmpHoplogs.length, logPrefix);

    String currentHoplogName = null;
    // get the current hoplog name. We need to ignore current hoplog while fixing. 
    if (currentHoplog != null) {
        currentHoplogName = currentHoplog.getFileName();
    }

    for (int i = 0; i < tmpHoplogs.length; i++) {
        // Skip directories
        if (tmpHoplogs[i].isDirectory()) {
            continue;
        }

        final Path p = tmpHoplogs[i].getPath();

        if (tmpHoplogs[i].getPath().getName().equals(currentHoplogName)) {
            if (logger.isDebugEnabled())
                logger.debug("Skipping current file: " + tmpHoplogs[i].getPath().getName(), logPrefix);
            continue;
        }

        SequenceFileHoplog hoplog = new SequenceFileHoplog(fs, p, stats);
        try {
            makeLegitimate(hoplog);
            logger.info(LocalizedMessage.create(LocalizedStrings.DEBUG, "Hoplog " + p + " was a temporary "
                    + "hoplog because the node managing it wasn't shutdown properly last time. Fixed the hoplog name."));
        } catch (IOException e) {
            logger.info(LocalizedMessage.create(LocalizedStrings.DEBUG, "Hoplog " + p + " is still a temporary "
                    + "hoplog because the node managing it wasn't shutdown properly last time. Failed to "
                    + "change the hoplog name because an exception was thrown while fixing it. " + e));
        }
    }
}