Usage examples for org.apache.hadoop.fs.FileStatus.isDirectory()
public boolean isDirectory()
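FileStatus.isDirectory() reports whether the entry described by a FileStatus is a directory; it replaces the deprecated FileStatus.isDir(). Before the full examples below, here is a minimal sketch of the typical pattern; the path /tmp and the default Configuration are placeholders, not taken from any of the examples that follow.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws IOException {
        // Placeholder configuration and path; substitute your own cluster settings.
        FileSystem fs = FileSystem.get(new Configuration());
        for (FileStatus status : fs.listStatus(new Path("/tmp"))) {
            if (status.isDirectory()) {
                System.out.println("dir:  " + status.getPath());
            } else {
                System.out.println("file: " + status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}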
From source file:com.blackberry.logdriver.admin.LogMaintenance.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); // If run by Oozie, then load the Oozie conf too if (System.getProperty("oozie.action.conf.xml") != null) { conf.addResource(new URL("file://" + System.getProperty("oozie.action.conf.xml"))); }//from w w w . j av a 2s .c om // For some reason, Oozie needs some options to be set in system instead of // in the confiuration. So copy the configs over. { Iterator<Entry<String, String>> i = conf.iterator(); while (i.hasNext()) { Entry<String, String> next = i.next(); System.setProperty(next.getKey(), next.getValue()); } } if (args.length < 3) { printUsage(); return 1; } String userName = args[0]; String dcNumber = args[1]; String service = args[2]; String date = null; String hour = null; if (args.length >= 4) { date = args[3]; } if (args.length >= 5) { hour = args[4]; } // Set from environment variables String mergeJobPropertiesFile = getConfOrEnv(conf, "MERGEJOB_CONF"); String filterJobPropertiesFile = getConfOrEnv(conf, "FILTERJOB_CONF"); String daysBeforeArchive = getConfOrEnv(conf, "DAYS_BEFORE_ARCHIVE"); String daysBeforeDelete = getConfOrEnv(conf, "DAYS_BEFORE_DELETE"); String maxConcurrentMR = getConfOrEnv(conf, "MAX_CONCURRENT_MR", "-1"); String zkConnectString = getConfOrEnv(conf, "ZK_CONNECT_STRING"); String logdir = getConfOrEnv(conf, "logdriver.logdir.name"); boolean resetOrphanedJobs = Boolean.parseBoolean(getConfOrEnv(conf, "reset.orphaned.jobs", "true")); String rootDir = getConfOrEnv(conf, "service.root.dir"); String maxTotalMR = getConfOrEnv(conf, "MAX_TOTAL_MR", "-1"); boolean doMerge = true; boolean doArchive = true; boolean doDelete = true; if (zkConnectString == null) { LOG.error("ZK_CONNECT_STRING is not set. Exiting."); return 1; } if (mergeJobPropertiesFile == null) { LOG.info("MERGEJOB_CONF is not set. Not merging."); doMerge = false; } if (filterJobPropertiesFile == null) { LOG.info("FILTERJOB_CONF is not set. Not archiving."); doArchive = false; } if (daysBeforeArchive == null) { LOG.info("DAYS_BEFORE_ARCHIVE is not set. Not archiving."); doArchive = false; } if (doArchive && Integer.parseInt(daysBeforeArchive) < 0) { LOG.info("DAYS_BEFORE_ARCHIVE is negative. Not archiving."); doArchive = false; } if (daysBeforeDelete == null) { LOG.info("DAYS_BEFORE_DELETE is not set. Not deleting."); doDelete = false; } if (doDelete && Integer.parseInt(daysBeforeDelete) < 0) { LOG.info("DAYS_BEFORE_DELETE is negative. Not deleting."); doDelete = false; } if (logdir == null) { LOG.info("LOGDRIVER_LOGDIR_NAME is not set. Using default value of 'logs'."); logdir = "logs"; } if (rootDir == null) { LOG.info("SERVICE_ROOT_DIR is not set. Using default value of 'service'."); rootDir = "/service"; } // We can hang if this fails. So make sure we abort if it fails. fs = null; try { fs = FileSystem.get(conf); fs.exists(new Path("/")); // Test if it works. } catch (IOException e) { LOG.error("Error getting filesystem.", e); return 1; } // Create the LockUtil instance lockUtil = new LockUtil(zkConnectString); // Now it's safe to create our Job Runner JobRunner jobRunner = new JobRunner(Integer.parseInt(maxConcurrentMR), Integer.parseInt(maxTotalMR)); Thread jobRunnerThread = new Thread(jobRunner); jobRunnerThread.setName("JobRunner"); jobRunnerThread.setDaemon(false); jobRunnerThread.start(); // Figure out what date we start filters on. 
String filterCutoffDate = ""; if (doArchive) { Calendar cal = Calendar.getInstance(); cal.add(Calendar.DAY_OF_MONTH, Integer.parseInt("-" + daysBeforeArchive)); filterCutoffDate = String.format("%04d%02d%02d%02d", cal.get(Calendar.YEAR), (cal.get(Calendar.MONTH) + 1), cal.get(Calendar.DAY_OF_MONTH), cal.get(Calendar.HOUR_OF_DAY)); LOG.info("Archiving logs from before {}", filterCutoffDate); } String deleteCutoffDate = ""; if (doDelete) { Calendar cal = Calendar.getInstance(); cal.add(Calendar.DAY_OF_MONTH, Integer.parseInt("-" + daysBeforeDelete)); deleteCutoffDate = String.format("%04d%02d%02d%02d", cal.get(Calendar.YEAR), (cal.get(Calendar.MONTH) + 1), cal.get(Calendar.DAY_OF_MONTH), cal.get(Calendar.HOUR_OF_DAY)); LOG.info("Deleting logs from before {}", deleteCutoffDate); } long now = System.currentTimeMillis(); // Various exceptions have been popping up here. So make sure I catch them // all. try { // Patterns to recognize hour, day and incoming directories, so that they // can be processed. Pattern datePathPattern; Pattern hourPathPattern; Pattern incomingPathPattern; Pattern dataPathPattern; Pattern archivePathPattern; Pattern workingPathPattern; if (hour != null) { datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")"); hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")"); incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/incoming"); dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/data"); archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/archive"); workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/working/([^/]+)_(\\d+)"); } else if (date != null) { datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")"); hourPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})"); incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/incoming"); dataPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/data"); archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/archive"); workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + 
Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/working/([^/]+)_(\\d+)"); } else { datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})"); hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})"); incomingPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/incoming"); dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/data"); archivePathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/archive"); workingPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/working/([^/]+)_(\\d+)"); } // Do a depth first search of the directory, processing anything that // looks // interesting along the way Deque<Path> paths = new ArrayDeque<Path>(); Path rootPath = new Path(rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/"); paths.push(rootPath); while (paths.size() > 0) { Path p = paths.pop(); LOG.debug("{}", p.toString()); if (!fs.exists(p)) { continue; } FileStatus dirStatus = fs.getFileStatus(p); FileStatus[] children = fs.listStatus(p); boolean addChildren = true; boolean old = dirStatus.getModificationTime() < now - WAIT_TIME; LOG.debug(" Was last modified {}ms ago", now - dirStatus.getModificationTime()); if (!old) { LOG.debug(" Skipping, since it's not old enough."); } else if ((!rootPath.equals(p)) && (children.length == 0 || (children.length == 1 && children[0].getPath().getName().equals(READY_MARKER)))) { // old and no children? Delete! LOG.info(" Deleting empty directory {}", p.toString()); fs.delete(p, true); } else { Matcher matcher = datePathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.debug("Checking date directory"); // If this is already done, then skip it. So only process if it // doesn't exist. if (fs.exists(new Path(p, READY_MARKER)) == false) { // Check each subdirectory. If they all have ready markers, then I // guess we're ready. boolean ready = true; for (FileStatus c : children) { if (c.isDirectory() && fs.exists(new Path(c.getPath(), READY_MARKER)) == false) { ready = false; break; } } if (ready) { fs.createNewFile(new Path(p, READY_MARKER)); } } } matcher = hourPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.debug("Checking hour directory"); // If this is already done, then skip it. So only process if it // doesn't exist. if (fs.exists(new Path(p, READY_MARKER)) == false) { // Check each subdirectory. If they all have ready markers, then I // guess we're ready. 
boolean ready = true; for (FileStatus c : children) { if (c.isDirectory() && fs.exists(new Path(c.getPath(), READY_MARKER)) == false) { ready = false; break; } } if (ready) { fs.createNewFile(new Path(p, READY_MARKER)); } } } // Check to see if we have to run a merge matcher = incomingPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.debug("Checking incoming directory"); String matchDate = matcher.group(1); String matchHour = matcher.group(2); String matchComponent = matcher.group(3); String timestamp = matchDate + matchHour; if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) { LOG.info("Deleting old directory: {}", p); fs.delete(p, true); addChildren = false; } else if (doMerge) { // old, looks right, and has children? Run it! boolean hasMatchingChildren = false; boolean subdirTooYoung = false; for (FileStatus child : children) { if (!hasMatchingChildren) { FileStatus[] grandchildren = fs.listStatus(child.getPath()); for (FileStatus gc : grandchildren) { if (VALID_FILE.matcher(gc.getPath().getName()).matches()) { hasMatchingChildren = true; break; } } } if (!subdirTooYoung) { if (child.getModificationTime() >= now - WAIT_TIME) { subdirTooYoung = true; LOG.debug(" Subdir {} is too young.", child.getPath()); } } } if (!hasMatchingChildren) { LOG.debug(" No files match the expected pattern ({})", VALID_FILE.pattern()); } if (hasMatchingChildren && !subdirTooYoung) { LOG.info(" Run Merge job {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); Properties jobProps = new Properties(); jobProps.load(new FileInputStream(mergeJobPropertiesFile)); jobProps.setProperty("jobType", "merge"); jobProps.setProperty("rootDir", rootDir); jobProps.setProperty("dcNumber", dcNumber); jobProps.setProperty("service", service); jobProps.setProperty("date", matchDate); jobProps.setProperty("hour", matchHour); jobProps.setProperty("component", matchComponent); jobProps.setProperty("user.name", userName); jobProps.setProperty("logdir", logdir); jobRunner.submit(jobProps); addChildren = false; } } } // Check to see if we need to run a filter and archive matcher = dataPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { String matchDate = matcher.group(1); String matchHour = matcher.group(2); String matchComponent = matcher.group(3); String timestamp = matchDate + matchHour; if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) { LOG.info("Deleting old directory: {}", p); fs.delete(p, true); addChildren = false; } else if (doArchive && timestamp.compareTo(filterCutoffDate) < 0) { Properties jobProps = new Properties(); jobProps.load(new FileInputStream(filterJobPropertiesFile)); jobProps.setProperty("jobType", "filter"); jobProps.setProperty("rootDir", rootDir); jobProps.setProperty("dcNumber", dcNumber); jobProps.setProperty("service", service); jobProps.setProperty("date", matchDate); jobProps.setProperty("hour", matchHour); jobProps.setProperty("component", matchComponent); jobProps.setProperty("user.name", userName); jobProps.setProperty("logdir", logdir); // Check to see if we should just keep all or delete all here. 
// The filter file should be here String appPath = jobProps.getProperty("oozie.wf.application.path"); appPath = appPath.replaceFirst("\\$\\{.*?\\}", ""); Path filterFile = new Path( appPath + "/" + conf.get("filter.definition.file", service + ".yaml")); LOG.info("Filter file is {}", filterFile); if (fs.exists(filterFile)) { List<BoomFilterMapper.Filter> filters = BoomFilterMapper.loadFilters(matchComponent, fs.open(filterFile)); if (filters == null) { LOG.warn( " Got null when getting filters. Not processing. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); } else if (filters.size() == 0) { LOG.warn(" Got no filters. Not processing. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); } else if (filters.size() == 1 && filters.get(0) instanceof BoomFilterMapper.KeepAllFilter) { LOG.info(" Keeping everything. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); // Move files from data to archive // delete it all! String destination = rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/" + matchDate + "/" + matchHour + "/" + matchComponent + "/archive/"; PathInfo pathInfo = new PathInfo(); pathInfo.setDcNumber(dcNumber); pathInfo.setService(service); pathInfo.setLogdir(logdir); pathInfo.setDate(matchDate); pathInfo.setHour(matchHour); pathInfo.setComponent(matchComponent); try { lockUtil.acquireWriteLock(lockUtil.getLockPath(pathInfo)); fs.mkdirs(new Path(destination)); for (FileStatus f : fs.listStatus(p)) { fs.rename(f.getPath(), new Path(destination)); } } finally { lockUtil.releaseWriteLock(lockUtil.getLockPath(pathInfo)); } } else if (filters.size() == 1 && filters.get(0) instanceof BoomFilterMapper.DropAllFilter) { LOG.info(" Dropping everything. 
{} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); PathInfo pathInfo = new PathInfo(); pathInfo.setDcNumber(dcNumber); pathInfo.setService(service); pathInfo.setLogdir(logdir); pathInfo.setDate(matchDate); pathInfo.setHour(matchHour); pathInfo.setComponent(matchComponent); try { lockUtil.acquireWriteLock(lockUtil.getLockPath(pathInfo)); fs.delete(p, true); } finally { lockUtil.releaseWriteLock(lockUtil.getLockPath(pathInfo)); } } else { LOG.info(" Run Filter/Archive job {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); jobRunner.submit(jobProps); } } else { LOG.warn("Skipping filter job, since no filter file exists"); } addChildren = false; } } matcher = archivePathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { String matchDate = matcher.group(1); String matchHour = matcher.group(2); String timestamp = matchDate + matchHour; if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) { LOG.info("Deleting old directory: {}", p); fs.delete(p, true); addChildren = false; } } matcher = workingPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.info(" Matches working pattern ({})", p); if (resetOrphanedJobs) { String matchDate = matcher.group(1); String matchHour = matcher.group(2); String matchComponent = matcher.group(3); // Move everything from working/xxx/incoming/ to incoming/ PathInfo lockPathInfo = new PathInfo(logdir, rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/" + matchDate + "/" + matchHour + "/" + matchComponent); lockUtil.acquireWriteLock(lockUtil.getLockPath(lockPathInfo)); FileStatus[] fileStatuses = fs.listStatus(new Path(p.toUri().getPath() + "/incoming/")); if (fileStatuses != null) { for (FileStatus fileStatus : fileStatuses) { Path toPath = new Path( fileStatus.getPath().getParent().getParent().getParent().getParent(), "incoming/" + fileStatus.getPath().getName()); LOG.info(" Moving data from {} to {}", fileStatus.getPath(), toPath); LOG.info(" mkdir {}", toPath); fs.mkdirs(toPath); Path fromDir = new Path(p.toUri().getPath(), "incoming/" + fileStatus.getPath().getName()); LOG.info(" moving from {}", fromDir); FileStatus[] files = fs.listStatus(fromDir); if (files == null || files.length == 0) { LOG.info(" Nothing to move from {}", fromDir); } else { for (FileStatus f : files) { LOG.info(" rename {} {}", f.getPath(), new Path(toPath, f.getPath().getName())); fs.rename(f.getPath(), new Path(toPath, f.getPath().getName())); } } LOG.info(" rm {}", fileStatus.getPath()); fs.delete(fileStatus.getPath(), true); } lockUtil.releaseWriteLock(lockUtil.getLockPath(lockPathInfo)); fs.delete(new Path(p.toUri().getPath()), true); } } addChildren = false; } } // Add any children which are directories to the stack. if (addChildren) { for (int i = children.length - 1; i >= 0; i--) { FileStatus child = children[i]; if (child.isDirectory()) { paths.push(child.getPath()); } } } } // Since we may have deleted a bunch of directories, delete any unused // locks // from ZooKeeper. { LOG.info("Checking for unused locks in ZooKeeper"); String scanPath = rootDir + "/" + dcNumber + "/" + service + "/" + logdir; if (date != null) { scanPath += "/" + date; if (hour != null) { scanPath += "/" + hour; } } List<LockInfo> lockInfo = lockUtil.scan(scanPath); for (LockInfo li : lockInfo) { // Check if the lock path still exists in HDFS. If it doesn't, then // delete it from ZooKeeper. 
String path = li.getPath(); String hdfsPath = path.substring(LockUtil.ROOT.length()); if (!fs.exists(new Path(hdfsPath))) { ZooKeeper zk = lockUtil.getZkClient(); while (!path.equals(LockUtil.ROOT)) { try { zk.delete(path, -1); } catch (KeeperException.NotEmptyException e) { // That's fine. just stop trying then. break; } catch (Exception e) { LOG.error("Caught exception trying to delete from ZooKeeper.", e); break; } LOG.info("Deleted from ZooKeeper: {}", path); path = path.substring(0, path.lastIndexOf('/')); } } } } // Now that we're done, wait for the Oozie Runner to stop, and print the // results. LOG.info("Waiting for Oozie jobs to complete."); jobRunner.shutdown(); jobRunnerThread.join(); LOG.info("Job Stats : Started={} Succeeded={} failed={} errors={}", new Object[] { jobRunner.getStarted(), jobRunner.getSucceeded(), jobRunner.getFailed(), jobRunner.getErrors() }); lockUtil.close(); } catch (Exception e) { LOG.error("Unexpected exception caught.", e); return 1; } return 0; }
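The isDirectory() calls in the long example above serve two purposes: checking that every child directory carries a READY_MARKER, and deciding which children to push onto the stack for the depth-first walk. Stripped of the job-specific logic, the traversal skeleton is roughly the following sketch; the commented visit hook is a placeholder, not part of the original code.

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DepthFirstWalk {
    /** Iterative depth-first traversal; only directories are pushed back onto the stack. */
    public static void walk(FileSystem fs, Path root) throws IOException {
        Deque<Path> paths = new ArrayDeque<>();
        paths.push(root);
        while (!paths.isEmpty()) {
            Path p = paths.pop();
            if (!fs.exists(p)) {
                continue; // the directory may have been deleted since it was queued
            }
            FileStatus[] children = fs.listStatus(p);
            // visit(p, children);  // placeholder for the per-directory processing
            for (int i = children.length - 1; i >= 0; i--) {
                if (children[i].isDirectory()) {
                    paths.push(children[i].getPath());
                }
            }
        }
    }
}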
From source file:com.cloudera.cdk.data.filesystem.FileSystemDataset.java
License:Apache License
@Deprecated
void accumulateDatafilePaths(Path directory, List<Path> paths) throws IOException {
    for (FileStatus status : fileSystem.listStatus(directory, PathFilters.notHidden())) {
        if (status.isDirectory()) {
            accumulateDatafilePaths(status.getPath(), paths);
        } else {
            paths.add(status.getPath());
        }
    }
}
From source file:com.cloudera.cdk.data.filesystem.FileSystemMetadataProvider.java
License:Apache License
@Override
public List<String> list() {
    List<String> datasets = Lists.newArrayList();
    try {
        FileStatus[] entries = rootFileSystem.listStatus(rootDirectory, PathFilters.notHidden());
        for (FileStatus entry : entries) {
            // assumes that all unhidden directories under the root are data sets
            if (entry.isDirectory() && rootFileSystem.exists(new Path(entry.getPath(), ".metadata"))) {
                // may want to add a check: !RESERVED_NAMES.contains(name)
                datasets.add(entry.getPath().getName());
            } else {
                continue;
            }
        }
    } catch (FileNotFoundException ex) {
        // the repo hasn't created any files yet
        return datasets;
    } catch (IOException ex) {
        throw new MetadataProviderException("Could not list data sets", ex);
    }
    return datasets;
}
From source file:com.cloudera.data.filesystem.FileSystemDataset.java
License:Apache License
private void accumulateDatafilePaths(Path directory, List<Path> paths) throws IOException {
    for (FileStatus status : fileSystem.listStatus(directory, PathFilters.notHidden())) {
        if (status.isDirectory()) {
            accumulateDatafilePaths(status.getPath(), paths);
        } else {
            paths.add(status.getPath());
        }
    }
}
From source file:com.cloudera.impala.catalog.HdfsTable.java
License:Apache License
/** * Loads the file block metadata for the given collection of FileDescriptors. The * FileDescriptors are passed as a tree, where the first level is indexed by * filesystem, the second level is indexed by partition location, and the leaves are * the list of files that exist under each directory. */// w w w.j av a2s. co m private void loadBlockMd(Map<FsKey, Map<String, List<FileDescriptor>>> perFsFileDescs) throws RuntimeException { Preconditions.checkNotNull(perFsFileDescs); LOG.debug("load block md for " + name_); for (FsKey fsEntry : perFsFileDescs.keySet()) { FileSystem fs = fsEntry.filesystem; // Store all BlockLocations so they can be reused when loading the disk IDs. List<BlockLocation> blockLocations = Lists.newArrayList(); int numCachedBlocks = 0; Map<String, List<FileDescriptor>> partitionToFds = perFsFileDescs.get(fsEntry); Preconditions.checkNotNull(partitionToFds); // loop over all files and record their block metadata, minus volume ids for (String partitionDir : partitionToFds.keySet()) { Path partDirPath = new Path(partitionDir); for (FileDescriptor fileDescriptor : partitionToFds.get(partitionDir)) { Path p = new Path(partDirPath, fileDescriptor.getFileName()); try { FileStatus fileStatus = fs.getFileStatus(p); // fileDescriptors should not contain directories. Preconditions.checkArgument(!fileStatus.isDirectory()); BlockLocation[] locations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen()); Preconditions.checkNotNull(locations); blockLocations.addAll(Arrays.asList(locations)); // Loop over all blocks in the file. for (BlockLocation block : locations) { String[] blockHostPorts = block.getNames(); try { blockHostPorts = block.getNames(); } catch (IOException e) { // this shouldn't happen, getNames() doesn't throw anything String errorMsg = "BlockLocation.getNames() failed:\n" + e.getMessage(); LOG.error(errorMsg); throw new IllegalStateException(errorMsg); } // Now enumerate all replicas of the block, adding any unknown hosts to // hostIndex_ and the index for that host to replicaHostIdxs. List<Integer> replicaHostIdxs = new ArrayList<Integer>(blockHostPorts.length); for (int i = 0; i < blockHostPorts.length; ++i) { String[] ip_port = blockHostPorts[i].split(":"); Preconditions.checkState(ip_port.length == 2); TNetworkAddress network_address = new TNetworkAddress(ip_port[0], Integer.parseInt(ip_port[1])); replicaHostIdxs.add(hostIndex_.getIndex(network_address)); } fileDescriptor.addFileBlock( new FileBlock(block.getOffset(), block.getLength(), replicaHostIdxs)); } } catch (IOException e) { throw new RuntimeException( "couldn't determine block locations for path '" + p + "':\n" + e.getMessage(), e); } } } if (SUPPORTS_VOLUME_ID && fs instanceof DistributedFileSystem) { LOG.trace("loading disk ids for: " + getFullName() + ". nodes: " + getNumNodes() + ". file system: " + fsEntry); loadDiskIds((DistributedFileSystem) fs, blockLocations, partitionToFds); LOG.trace("completed load of disk ids for: " + getFullName()); } } }
From source file:com.cloudera.impala.catalog.HdfsTable.java
License:Apache License
/** * Creates a new HdfsPartition object to be added to the internal partition list. * Populates with file format information and file locations. Partitions may be empty, * or may not even exist on the file system (a partition's location may have been * changed to a new path that is about to be created by an INSERT). For unchanged * files (indicated by unchanged mtime), reuses the FileDescriptor from the * oldFileDescMap. The one exception is if the partition is marked as cached * in which case the block metadata cannot be reused. Otherwise, creates a new * FileDescriptor for each modified or new file and adds it to newFileDescMap. * Both old and newFileDescMap are Maps of parent directory (partition location) * to list of files (FileDescriptors) under that directory. * Returns new partition if successful or null if none was added. * Separated from addPartition to reduce the number of operations done * while holding the lock on the hdfs table. //from w ww. j av a 2 s . c o m * @throws CatalogException * if the supplied storage descriptor contains metadata that Impala can't * understand. */ private HdfsPartition createPartition(StorageDescriptor storageDescriptor, org.apache.hadoop.hive.metastore.api.Partition msPartition, Map<String, List<FileDescriptor>> oldFileDescMap, Map<FsKey, Map<String, List<FileDescriptor>>> perFsFileDescMap) throws CatalogException { HdfsStorageDescriptor fileFormatDescriptor = HdfsStorageDescriptor.fromStorageDescriptor(this.name_, storageDescriptor); Path partDirPath = new Path(storageDescriptor.getLocation()); List<FileDescriptor> fileDescriptors = Lists.newArrayList(); // If the partition is marked as cached, the block location metadata must be // reloaded, even if the file times have not changed. boolean isMarkedCached = isMarkedCached_; List<LiteralExpr> keyValues = Lists.newArrayList(); if (msPartition != null) { isMarkedCached = HdfsCachingUtil.getCacheDirIdFromParams(msPartition.getParameters()) != null; // Load key values for (String partitionKey : msPartition.getValues()) { Type type = getColumns().get(keyValues.size()).getType(); // Deal with Hive's special NULL partition key. if (partitionKey.equals(nullPartitionKeyValue_)) { keyValues.add(NullLiteral.create(type)); } else { try { keyValues.add(LiteralExpr.create(partitionKey, type)); } catch (Exception ex) { LOG.warn("Failed to create literal expression of type: " + type, ex); throw new CatalogException("Invalid partition key value of type: " + type, ex); } } } try { Expr.analyze(keyValues, null); } catch (AnalysisException e) { // should never happen throw new IllegalStateException(e); } } try { // Each partition could reside on a different filesystem. FileSystem fs = partDirPath.getFileSystem(CONF); multipleFileSystems_ = multipleFileSystems_ || !FileSystemUtil.isPathOnFileSystem(new Path(getLocation()), fs); if (fs.exists(partDirPath)) { // FileSystem does not have an API that takes in a timestamp and returns a list // of files that has been added/changed since. Therefore, we are calling // fs.listStatus() to list all the files. for (FileStatus fileStatus : fs.listStatus(partDirPath)) { String fileName = fileStatus.getPath().getName().toString(); if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName) || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) { // Ignore directory, hidden file starting with . or _, and LZO index files // If a directory is erroneously created as a subdirectory of a partition dir // we should ignore it and move on. 
Hive will not recurse into directories. // Skip index files, these are read by the LZO scanner directly. continue; } String partitionDir = fileStatus.getPath().getParent().toString(); FileDescriptor fd = null; // Search for a FileDescriptor with the same partition dir and file name. If one // is found, it will be chosen as a candidate to reuse. if (oldFileDescMap != null && oldFileDescMap.get(partitionDir) != null) { for (FileDescriptor oldFileDesc : oldFileDescMap.get(partitionDir)) { if (oldFileDesc.getFileName().equals(fileName)) { fd = oldFileDesc; break; } } } // Check if this FileDescriptor has been modified since last loading its block // location information. If it has not been changed, the previously loaded // value can be reused. if (fd == null || isMarkedCached || fd.getFileLength() != fileStatus.getLen() || fd.getModificationTime() != fileStatus.getModificationTime()) { // Create a new file descriptor, the block metadata will be populated by // loadBlockMd. fd = new FileDescriptor(fileName, fileStatus.getLen(), fileStatus.getModificationTime()); addPerFsFileDesc(perFsFileDescMap, fs, partitionDir, fd); } List<FileDescriptor> fds = fileDescMap_.get(partitionDir); if (fds == null) { fds = Lists.newArrayList(); fileDescMap_.put(partitionDir, fds); } fds.add(fd); // Add to the list of FileDescriptors for this partition. fileDescriptors.add(fd); } numHdfsFiles_ += fileDescriptors.size(); } HdfsPartition partition = new HdfsPartition(this, msPartition, keyValues, fileFormatDescriptor, fileDescriptors, getAvailableAccessLevel(fs, partDirPath)); partition.checkWellFormed(); return partition; } catch (Exception e) { throw new CatalogException("Failed to create partition: ", e); } }
From source file:com.cloudera.impala.common.FileSystemUtil.java
License:Apache License
/**
 * Moves all visible (non-hidden) files from a source directory to a destination
 * directory. Any sub-directories within the source directory are skipped.
 * Returns the number of files moved as part of this operation.
 */
public static int moveAllVisibleFiles(Path sourceDir, Path destDir) throws IOException {
    FileSystem fs = destDir.getFileSystem(CONF);
    Preconditions.checkState(fs.isDirectory(destDir));
    Preconditions.checkState(fs.isDirectory(sourceDir));
    // Use the same UUID to resolve all file name conflicts. This helps mitigate problems
    // that might happen if there is a conflict moving a set of files that have
    // dependent file names. For example, foo.lzo and foo.lzo_index.
    UUID uuid = UUID.randomUUID();
    // Enumerate all the files in the source
    int numFilesMoved = 0;
    for (FileStatus fStatus : fs.listStatus(sourceDir)) {
        if (fStatus.isDirectory()) {
            LOG.debug("Skipping copy of directory: " + fStatus.getPath());
            continue;
        } else if (isHiddenFile(fStatus.getPath().getName())) {
            continue;
        }
        Path destFile = new Path(destDir, fStatus.getPath().getName());
        if (fs.exists(destFile)) {
            destFile = new Path(destDir, appendToBaseFileName(destFile.getName(), uuid.toString()));
        }
        FileSystemUtil.moveFile(fStatus.getPath(), destFile, false);
        ++numFilesMoved;
    }
    return numFilesMoved;
}
From source file:com.cloudera.impala.common.FileSystemUtil.java
License:Apache License
/**
 * Returns true if the given Path contains any sub directories, otherwise false.
 */
public static boolean containsSubdirectory(Path directory) throws FileNotFoundException, IOException {
    FileSystem fs = directory.getFileSystem(CONF);
    // Enumerate all the files in the source
    for (FileStatus fStatus : fs.listStatus(directory)) {
        if (fStatus.isDirectory()) {
            return true;
        }
    }
    return false;
}
From source file:com.cloudera.impala.util.LoadMetadataUtil.java
License:Apache License
/**
 * Gets a file descriptor according to fileStatus and oldFileDescMap. Returns null
 * if the file is a directory, a hidden file starting with . or _, or an LZO index file.
 * If the file can be found in the old file description map, has not been modified, and
 * the partition is not marked as cached ('isMarkedCached'), the cached entry is reused.
 * Otherwise a new FileDescriptor is created from the file name, file length and
 * modification time.
 *
 * Must be thread safe. Access to 'oldFileDescMap', 'perFsFileBlocks' and 'hostIndex'
 * must be protected.
 */
private static FileDescriptor getFileDescriptor(FileSystem fs, FileStatus fileStatus,
        HdfsFileFormat fileFormat, Map<String, List<FileDescriptor>> oldFileDescMap,
        boolean isMarkedCached, Map<FsKey, FileBlocksInfo> perFsFileBlocks, String tblName,
        ListMap<TNetworkAddress> hostIndex) {
    String fileName = fileStatus.getPath().getName().toString();
    if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName)
            || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) {
        // Ignore directories, hidden files starting with . or _, and LZO index files.
        // If a directory is erroneously created as a subdirectory of a partition dir
        // we should ignore it and move on. Hive will not recurse into directories.
        // Skip index files, these are read by the LZO scanner directly.
        return null;
    }
    String partitionDir = fileStatus.getPath().getParent().toString();
    FileDescriptor fd = null;
    // Search for a FileDescriptor with the same partition dir and file name. If one
    // is found, it will be chosen as a candidate to reuse.
    if (oldFileDescMap != null) {
        synchronized (oldFileDescMap) {
            if (oldFileDescMap.get(partitionDir) != null) {
                for (FileDescriptor oldFileDesc : oldFileDescMap.get(partitionDir)) {
                    // TODO: This doesn't seem like the right data structure if a directory has
                    // a lot of files.
                    if (oldFileDesc.getFileName().equals(fileName)) {
                        fd = oldFileDesc;
                        break;
                    }
                }
            }
        }
    }
    // Check if this FileDescriptor has been modified since last loading its block
    // location information. If it has not been changed, the previously loaded value can
    // be reused.
    if (fd == null || isMarkedCached || fd.getFileLength() != fileStatus.getLen()
            || fd.getModificationTime() != fileStatus.getModificationTime()) {
        // Create a new file descriptor and load the file block metadata,
        // collecting the block metadata into perFsFileBlocks. The disk IDs for
        // all the blocks of each filesystem will be loaded by loadDiskIds().
        fd = new FileDescriptor(fileName, fileStatus.getLen(), fileStatus.getModificationTime());
        loadBlockMetadata(fs, fileStatus, fd, fileFormat, perFsFileBlocks, tblName, hostIndex);
    }
    return fd;
}
From source file:com.cloudera.impala.util.LoadMetadataUtil.java
License:Apache License
/**
 * Queries the filesystem to load the file block metadata (e.g. DFS blocks) for the
 * given file. Adds the newly created block metadata and block location to the
 * perFsFileBlocks, so that the storage IDs for each block can be retrieved from
 * BlockLocation.
 *
 * Must be threadsafe. Access to 'perFsFileBlocks' and 'hostIndex' must be protected.
 */
private static void loadBlockMetadata(FileSystem fs, FileStatus file, FileDescriptor fd,
        HdfsFileFormat fileFormat, Map<FsKey, FileBlocksInfo> perFsFileBlocks, String tblName,
        ListMap<TNetworkAddress> hostIndex) {
    Preconditions.checkNotNull(fd);
    Preconditions.checkNotNull(perFsFileBlocks);
    Preconditions.checkArgument(!file.isDirectory());
    LOG.debug("load block md for " + tblName + " file " + fd.getFileName());
    if (!FileSystemUtil.hasGetFileBlockLocations(fs)) {
        synthesizeBlockMetadata(file, fd, fileFormat, hostIndex);
        return;
    }
    try {
        BlockLocation[] locations = null;
        if (file instanceof LocatedFileStatus) {
            locations = ((LocatedFileStatus) file).getBlockLocations();
        } else {
            locations = fs.getFileBlockLocations(file, 0, file.getLen());
        }
        Preconditions.checkNotNull(locations);
        // Loop over all blocks in the file.
        for (BlockLocation loc : locations) {
            Preconditions.checkNotNull(loc);
            fd.addFileBlock(createFileBlock(loc, hostIndex));
        }
        // Remember the THdfsFileBlocks and corresponding BlockLocations. Once all the
        // blocks are collected, the disk IDs will be queried in one batch per filesystem.
        FsKey fsKey = new FsKey(fs);
        synchronized (perFsFileBlocks) {
            FileBlocksInfo infos = perFsFileBlocks.get(fsKey);
            if (infos == null) {
                infos = new FileBlocksInfo();
                perFsFileBlocks.put(fsKey, infos);
            }
            infos.addBlocks(fd.getFileBlocks(), Arrays.asList(locations));
        }
    } catch (IOException e) {
        throw new RuntimeException("couldn't determine block locations for path '" + file.getPath()
                + "':\n" + e.getMessage(), e);
    }
}
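As an aside not taken from the sources above: when block locations are needed for every file, FileSystem.listFiles(path, recursive) returns LocatedFileStatus objects that already carry their block locations and never describe directories, so the isDirectory() guard and the separate getFileBlockLocations() call can be avoided on that path. A minimal sketch:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListWithLocations {
    // listFiles(dir, true) recurses and yields files only (never directories),
    // each already populated with its block locations.
    public static void list(Path dir) throws IOException {
        FileSystem fs = dir.getFileSystem(new Configuration());
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(dir, true);
        while (it.hasNext()) {
            LocatedFileStatus status = it.next();
            System.out.println(status.getPath() + " blocks=" + status.getBlockLocations().length);
        }
    }
}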