List of usage examples for org.apache.hadoop.fs.FileStatus.isDirectory()
public boolean isDirectory()
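Before the project examples, here is a minimal sketch of how isDirectory() is typically obtained and used. The path and class name are hypothetical placeholders, not taken from any of the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical path; replace with a real HDFS or local path.
        Path path = new Path("/tmp/example");
        FileSystem fs = path.getFileSystem(new Configuration());

        FileStatus status = fs.getFileStatus(path);
        if (status.isDirectory()) {
            // List immediate children and report which of them are directories.
            for (FileStatus child : fs.listStatus(path)) {
                System.out.println(child.getPath() + " directory=" + child.isDirectory());
            }
        } else {
            System.out.println(path + " is a regular file of " + status.getLen() + " bytes");
        }
    }
}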
From source file:org.apache.kylin.engine.mr.common.AbstractHadoopJob.java
License:Apache License
public static int addInputDirs(String[] inputs, Job job) throws IOException {
    int ret = 0; // return number of added folders
    for (String inp : inputs) {
        inp = inp.trim();
        if (inp.endsWith("/*")) {
            inp = inp.substring(0, inp.length() - 2);
            FileSystem fs = HadoopUtil.getWorkingFileSystem(job.getConfiguration());
            Path path = new Path(inp);
            if (!exists(fs, path)) {
                logger.warn("Path not exist:" + path.toString());
                continue;
            }
            FileStatus[] fileStatuses = fs.listStatus(path);
            boolean hasDir = false;
            for (FileStatus stat : fileStatuses) {
                if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
                    hasDir = true;
                    ret += addInputDirs(new String[] { stat.getPath().toString() }, job);
                }
            }
            if (fileStatuses.length > 0 && !hasDir) {
                ret += addInputDirs(new String[] { path.toString() }, job);
            }
        } else {
            logger.debug("Add input " + inp);
            FileInputFormat.addInputPath(job, new Path(inp));
            ret++;
        }
    }
    return ret;
}
From source file:org.apache.kylin.engine.spark.SparkUtil.java
License:Apache License
/**
 * Read the given path as a Java RDD; the path can have a second-level sub folder.
 * @param inputPath
 * @param fs
 * @param sc
 * @param keyClass
 * @param valueClass
 * @return
 * @throws IOException
 */
public static JavaPairRDD parseInputPath(String inputPath, FileSystem fs, JavaSparkContext sc, Class keyClass,
        Class valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
    for (FileStatus stat : fileStatuses) {
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }
    if (!hasDir) {
        return sc.sequenceFile(inputHDFSPath.toString(), keyClass, valueClass);
    }
    return sc.sequenceFile(StringUtil.join(inputFolders, ","), keyClass, valueClass);
}
From source file:org.apache.kylin.job.hadoop.AbstractHadoopJob.java
License:Apache License
public void addInputDirs(String input, Job job) throws IOException {
    for (String inp : StringSplitter.split(input, ",")) {
        inp = inp.trim();
        if (inp.endsWith("/*")) {
            inp = inp.substring(0, inp.length() - 2);
            FileSystem fs = FileSystem.get(job.getConfiguration());
            Path path = new Path(inp);
            FileStatus[] fileStatuses = fs.listStatus(path);
            boolean hasDir = false;
            for (FileStatus stat : fileStatuses) {
                if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
                    hasDir = true;
                    addInputDirs(stat.getPath().toString(), job);
                }
            }
            if (fileStatuses.length > 0 && !hasDir) {
                addInputDirs(path.toString(), job);
            }
        } else {
            logger.debug("Add input " + inp);
            FileInputFormat.addInputPath(job, new Path(inp));
        }
    }
}
From source file:org.apache.kylin.source.hive.ITHiveTezUnionAllTest.java
License:Apache License
private void testMaterializeView(boolean isDistributeBy) throws Exception {
    KylinConfig config = getTestConfig();
    /**
     * For UT debug
     * config.setProperty("kylin.job.use-remote-cli", "true");
     */
    String viewName = "test_union_all_view";
    String tableName = "test_union_all_table";
    HiveCmdBuilder hiveCmdBuilder = new HiveCmdBuilder();
    JobEngineConfig jobConf = new JobEngineConfig(config);
    String storagePath = JobBuilderSupport.getJobWorkingDir(jobConf, "it-test") + "/" + tableName;

    StringBuilder testCmd = new StringBuilder();
    testCmd.append("USE " + config.getHiveDatabaseForIntermediateTable() + ";").append("\n");
    testCmd.append("SET hive.execution.engine=tez;");
    testCmd.append("DROP VIEW IF EXISTS " + viewName + ";\n");
    testCmd.append("CREATE VIEW " + viewName
            + " AS SELECT * FROM test_kylin_fact UNION ALL SELECT * FROM test_kylin_fact").append(";\n");
    testCmd.append("DROP TABLE IF EXISTS " + tableName + ";\n");
    testCmd.append("CREATE TABLE IF NOT EXISTS " + tableName + "\n");
    testCmd.append("LOCATION '" + storagePath + "'\n");
    testCmd.append("AS SELECT * FROM " + viewName + "\n");

    if (isDistributeBy)
        hiveCmdBuilder.addStatementWithRedistributeBy(testCmd);
    else
        hiveCmdBuilder.addStatement(testCmd.toString());

    Path rootPath = new Path(storagePath);
    FileSystem fs = HadoopUtil.getFileSystem(storagePath);
    fs.delete(rootPath, true);
    fs.mkdirs(rootPath);

    config.getCliCommandExecutor().execute(hiveCmdBuilder.build());

    rootPath = fs.makeQualified(rootPath);
    for (FileStatus statsFolder : fs.listStatus(rootPath)) {
        if (isDistributeBy)
            Assert.assertTrue(!statsFolder.isDirectory());
        else
            Assert.assertTrue(statsFolder.isDirectory());
    }

    HiveCmdBuilder cleanupCmdBuilder = new HiveCmdBuilder();
    StringBuilder cleanupCmd = new StringBuilder();
    cleanupCmd.append("USE " + config.getHiveDatabaseForIntermediateTable() + ";").append("\n");
    cleanupCmd.append("DROP VIEW IF EXISTS " + viewName + ";\n");
    cleanupCmd.append("DROP TABLE IF EXISTS " + tableName + ";\n");
    cleanupCmdBuilder.addStatement(cleanupCmd.toString());
    config.getCliCommandExecutor().execute(cleanupCmdBuilder.build());
    fs.delete(rootPath, true);
}
From source file:org.apache.lens.server.query.QueryResultPurger.java
License:Apache License
public void purgePaths(Path path, DateUtil.TimeDiff retention, boolean purgeDirectory) throws IOException {
    int counter = 0;
    FileSystem fs = path.getFileSystem(conf);
    FileStatus[] fileList = fs.listStatus(path);
    for (FileStatus f : fileList) {
        if ((f.isFile() || (f.isDirectory() && purgeDirectory)) && canBePurged(f, retention)) {
            try {
                if (fs.delete(f.getPath(), true)) {
                    counter++;
                } else {
                    getMetrics().incrCounter(this.getClass(), QUERY_RESULT_PURGER_ERROR_COUNTER);
                }
            } catch (IOException e) {
                getMetrics().incrCounter(this.getClass(), QUERY_RESULT_PURGER_ERROR_COUNTER);
            }
        }
    }
    log.info("Purged {} files/directories in {}", counter, path.toString());
}
From source file:org.apache.lens.server.query.TestQueryService.java
License:Apache License
/**
 * Read result set.
 *
 * @param resultset the resultset
 * @param handle    the handle
 * @param isDir     the is dir
 * @return the list
 * @throws IOException Signals that an I/O exception has occurred.
 */
public static List<String> readResultSet(PersistentQueryResult resultset, QueryHandle handle, boolean isDir)
        throws IOException {
    assertTrue(resultset.getPersistedURI().contains(handle.toString()));
    Path actualPath = new Path(resultset.getPersistedURI());
    FileSystem fs = actualPath.getFileSystem(new Configuration());
    List<String> actualRows = new ArrayList<>();
    if (fs.getFileStatus(actualPath).isDir()) {
        assertTrue(isDir);
        for (FileStatus fstat : fs.listStatus(actualPath)) {
            if (!fstat.isDirectory()) {
                addRowsFromFile(actualRows, fs, fstat.getPath());
            }
        }
    } else {
        assertFalse(isDir);
        addRowsFromFile(actualRows, fs, actualPath);
    }
    return actualRows;
}
From source file:org.apache.manifoldcf.crawler.connectors.hdfs.HDFSRepositoryConnector.java
License:Apache License
/** Queue "seed" documents. Seed documents are the starting places for crawling activity. Documents * are seeded when this method calls appropriate methods in the passed in ISeedingActivity object. * * This method can choose to find repository changes that happen only during the specified time interval. * The seeds recorded by this method will be viewed by the framework based on what the * getConnectorModel() method returns.//from w ww .jav a 2 s .c o m * * It is not a big problem if the connector chooses to create more seeds than are * strictly necessary; it is merely a question of overall work required. * * The end time and seeding version string passed to this method may be interpreted for greatest efficiency. * For continuous crawling jobs, this method will * be called once, when the job starts, and at various periodic intervals as the job executes. * * When a job's specification is changed, the framework automatically resets the seeding version string to null. The * seeding version string may also be set to null on each job run, depending on the connector model returned by * getConnectorModel(). * * Note that it is always ok to send MORE documents rather than less to this method. * The connector will be connected before this method can be called. *@param activities is the interface this method should use to perform whatever framework actions are desired. *@param spec is a document specification (that comes from the job). *@param seedTime is the end of the time range of documents to consider, exclusive. *@param lastSeedVersionString is the last seeding version string for this job, or null if the job has no previous seeding version string. *@param jobMode is an integer describing how the job is being run, whether continuous or once-only. *@return an updated seeding version string, to be stored with the job. */ @Override public String addSeedDocuments(ISeedingActivity activities, Specification spec, String lastSeedVersion, long seedTime, int jobMode) throws ManifoldCFException, ServiceInterruption { String path = StringUtils.EMPTY; int i = 0; while (i < spec.getChildCount()) { SpecificationNode sn = spec.getChild(i); if (sn.getType().equals("startpoint")) { path = sn.getAttributeValue("path"); FileStatus fileStatus = getObject(new Path(path)); if (fileStatus.isDirectory()) { activities.addSeedDocument(fileStatus.getPath().toUri().toString()); } } i++; } return ""; }
From source file:org.apache.manifoldcf.crawler.connectors.hdfs.HDFSRepositoryConnector.java
License:Apache License
/** Process a set of documents.
 * This is the method that should cause each document to be fetched, processed, and the results either added
 * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
 * The document specification allows this class to filter what is done based on the job.
 * The connector will be connected before this method can be called.
 *@param documentIdentifiers is the set of document identifiers to process.
 *@param statuses are the currently-stored document versions for each document in the set of document identifiers
 * passed in above.
 *@param activities is the interface this method should use to queue up new document references
 * and ingest documents.
 *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
 *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
 */
@Override
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
        IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
        throws ManifoldCFException, ServiceInterruption {
    for (String documentIdentifier : documentIdentifiers) {
        String versionString;
        FileStatus fileStatus = getObject(new Path(documentIdentifier));
        if (fileStatus != null) {
            boolean isDirectory = fileStatus.isDirectory();
            if (isDirectory) {
                // If HDFS directory modify dates are transitive, as they are on Unix,
                // then getting the modify date of the current version is sufficient
                // to detect any downstream changes we need to be aware of.
                // (If this turns out to be a bad assumption, this should simply set rval[i] = "").
                long lastModified = fileStatus.getModificationTime();
                versionString = new Long(lastModified).toString();
                if (activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)) {
                    // Process directory!
                    String entityReference = documentIdentifier;
                    FileStatus[] fileStatuses = getChildren(fileStatus.getPath());
                    if (fileStatuses == null) {
                        continue;
                    }
                    for (int j = 0; j < fileStatuses.length; j++) {
                        FileStatus fs = fileStatuses[j];
                        String canonicalPath = fs.getPath().toString();
                        if (checkInclude(session.getUri().toString(), fs, canonicalPath, spec)) {
                            activities.addDocumentReference(canonicalPath, documentIdentifier, RELATIONSHIP_CHILD);
                        }
                    }
                }
            } else {
                long lastModified = fileStatus.getModificationTime();
                StringBuilder sb = new StringBuilder();
                // Check if the path is to be converted. We record that info in the version string so that we'll
                // reindex documents whose URIs change.
                String nameNode = nameNodeProtocol + "://" + nameNodeHost + ":" + nameNodePort;
                String convertPath = findConvertPath(nameNode, spec, fileStatus.getPath());
                if (convertPath != null) {
                    // Record the path.
                    sb.append("+");
                    pack(sb, convertPath, '+');
                } else
                    sb.append("-");
                sb.append(new Long(lastModified).toString());
                versionString = sb.toString();

                // We will record document fetch as an activity
                long startTime = System.currentTimeMillis();
                String errorCode = null;
                String errorDesc = null;
                long fileSize = 0;

                if (activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)) {
                    // Process file!
                    if (!checkIngest(session.getUri().toString(), fileStatus, spec)) {
                        activities.noDocument(documentIdentifier, versionString);
                        continue;
                    }

                    // It is a file to be indexed.
                    long fileLength = fileStatus.getLen();
                    String fileName = fileStatus.getPath().getName();
                    String mimeType = mapExtensionToMimeType(fileStatus.getPath().getName());
                    Date modifiedDate = new Date(fileStatus.getModificationTime());
                    try {
                        String uri;
                        if (convertPath != null) {
                            uri = convertToWGETURI(convertPath);
                        } else {
                            uri = fileStatus.getPath().toUri().toString();
                        }

                        if (!activities.checkLengthIndexable(fileLength)) {
                            errorCode = activities.EXCLUDED_LENGTH;
                            errorDesc = "Excluding document because of file length ('" + fileLength + "')";
                            activities.noDocument(documentIdentifier, versionString);
                            continue;
                        }

                        if (!activities.checkURLIndexable(uri)) {
                            errorCode = activities.EXCLUDED_URL;
                            errorDesc = "Excluding document because of URL ('" + uri + "')";
                            activities.noDocument(documentIdentifier, versionString);
                            continue;
                        }

                        if (!activities.checkMimeTypeIndexable(mimeType)) {
                            errorCode = activities.EXCLUDED_MIMETYPE;
                            errorDesc = "Excluding document because of mime type (" + mimeType + ")";
                            activities.noDocument(documentIdentifier, versionString);
                            continue;
                        }

                        if (!activities.checkDateIndexable(modifiedDate)) {
                            errorCode = activities.EXCLUDED_DATE;
                            errorDesc = "Excluding document because of date (" + modifiedDate + ")";
                            activities.noDocument(documentIdentifier, versionString);
                            continue;
                        }

                        // Prepare the metadata part of RepositoryDocument
                        RepositoryDocument data = new RepositoryDocument();
                        data.setFileName(fileName);
                        data.setMimeType(mimeType);
                        data.setModifiedDate(modifiedDate);
                        data.addField("uri", uri);

                        BackgroundStreamThread t = new BackgroundStreamThread(getSession(),
                                new Path(documentIdentifier));
                        try {
                            t.start();
                            boolean wasInterrupted = false;
                            try {
                                InputStream is = t.getSafeInputStream();
                                try {
                                    data.setBinary(is, fileSize);
                                    activities.ingestDocumentWithException(documentIdentifier, versionString, uri,
                                            data);
                                } finally {
                                    is.close();
                                }
                            } catch (java.net.SocketTimeoutException e) {
                                throw e;
                            } catch (InterruptedIOException e) {
                                wasInterrupted = true;
                                throw e;
                            } catch (ManifoldCFException e) {
                                if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
                                    wasInterrupted = true;
                                }
                                throw e;
                            } finally {
                                if (!wasInterrupted) {
                                    // This does a join
                                    t.finishUp();
                                }
                            }

                            // No errors. Record the fact that we made it.
                            errorCode = "OK";
                            // Length we did in bytes
                            fileSize = fileStatus.getLen();
                        } catch (InterruptedException e) {
                            // We were interrupted out of the join, most likely. Before we abandon the thread,
                            // send a courtesy interrupt.
                            t.interrupt();
                            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
                                    ManifoldCFException.INTERRUPTED);
                        } catch (java.net.SocketTimeoutException e) {
                            errorCode = "IOERROR";
                            errorDesc = e.getMessage();
                            handleIOException(e);
                        } catch (InterruptedIOException e) {
                            t.interrupt();
                            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
                                    ManifoldCFException.INTERRUPTED);
                        } catch (IOException e) {
                            errorCode = "IOERROR";
                            errorDesc = e.getMessage();
                            handleIOException(e);
                        }
                    } finally {
                        if (errorCode != null) {
                            activities.recordActivity(new Long(startTime), ACTIVITY_READ, new Long(fileSize),
                                    documentIdentifier, errorCode, errorDesc, null);
                        }
                    }
                }
            }
        } else {
            activities.deleteDocument(documentIdentifier);
            continue;
        }
    }
}
From source file:org.apache.manifoldcf.crawler.connectors.hdfs.HDFSRepositoryConnector.java
License:Apache License
/** Check if a file or directory should be included, given a document specification.
 *@param fileName is the canonical file name.
 *@param documentSpecification is the specification.
 *@return true if it should be included.
 */
protected static boolean checkInclude(String nameNode, FileStatus fileStatus, String fileName,
        Specification documentSpecification) throws ManifoldCFException {
    if (Logging.connectors.isDebugEnabled()) {
        Logging.connectors.debug("Checking whether to include file '" + fileName + "'");
    }

    String pathPart;
    String filePart;
    if (fileStatus.isDirectory()) {
        pathPart = fileName;
        filePart = null;
    } else {
        pathPart = fileStatus.getPath().getParent().toString();
        filePart = fileStatus.getPath().getName();
    }

    // Scan until we match a startpoint
    int i = 0;
    while (i < documentSpecification.getChildCount()) {
        SpecificationNode sn = documentSpecification.getChild(i++);
        if (sn.getType().equals("startpoint")) {
            String path = null;
            try {
                path = new URI(nameNode).resolve(sn.getAttributeValue("path")).toString();
            } catch (URISyntaxException e) {
                e.printStackTrace();
            }
            if (Logging.connectors.isDebugEnabled()) {
                Logging.connectors.debug("Checking path '" + path + "' against canonical '" + pathPart + "'");
            }
            // Compare with filename
            int matchEnd = matchSubPath(path, pathPart);
            if (matchEnd == -1) {
                if (Logging.connectors.isDebugEnabled()) {
                    Logging.connectors
                            .debug("Match check '" + path + "' against canonical '" + pathPart + "' failed");
                }
                continue;
            }
            // matchEnd is the start of the rest of the path (after the match) in fileName.
            // We need to walk through the rules and see whether it's in or out.
            int j = 0;
            while (j < sn.getChildCount()) {
                SpecificationNode node = sn.getChild(j++);
                String flavor = node.getType();
                String match = node.getAttributeValue("match");
                String type = node.getAttributeValue("type");
                // If type is "file", then our match string is against the filePart.
                // If filePart is null, then this rule is simply skipped.
                String sourceMatch;
                int sourceIndex;
                if (type.equals("file")) {
                    if (filePart == null) {
                        continue;
                    }
                    sourceMatch = filePart;
                    sourceIndex = 0;
                } else {
                    if (filePart != null) {
                        continue;
                    }
                    sourceMatch = pathPart;
                    sourceIndex = matchEnd;
                }
                if (flavor.equals("include")) {
                    if (checkMatch(sourceMatch, sourceIndex, match)) {
                        return true;
                    }
                } else if (flavor.equals("exclude")) {
                    if (checkMatch(sourceMatch, sourceIndex, match)) {
                        return false;
                    }
                }
            }
        }
    }
    if (Logging.connectors.isDebugEnabled()) {
        Logging.connectors.debug("Not including '" + fileName + "' because no matching rules");
    }
    return false;
}
From source file:org.apache.nifi.processors.hadoop.GetHDFS.java
License:Apache License
/**
 * Poll HDFS for files to process that match the configured file filters.
 *
 * @param hdfs hdfs
 * @param dir dir
 * @param filesVisited filesVisited
 * @return files to process
 * @throws java.io.IOException ex
 */
protected Set<Path> selectFiles(final FileSystem hdfs, final Path dir, Set<Path> filesVisited)
        throws IOException, InterruptedException {
    if (null == filesVisited) {
        filesVisited = new HashSet<>();
    }

    if (!hdfs.exists(dir)) {
        throw new IOException("Selection directory " + dir.toString() + " doesn't appear to exist!");
    }

    final Set<Path> files = new HashSet<>();

    FileStatus[] fileStatuses = getUserGroupInformation()
            .doAs((PrivilegedExceptionAction<FileStatus[]>) () -> hdfs.listStatus(dir));
    for (final FileStatus file : fileStatuses) {
        if (files.size() >= MAX_WORKING_QUEUE_SIZE) {
            // no need to make the files set larger than what we would queue anyway
            break;
        }

        final Path canonicalFile = file.getPath();

        if (!filesVisited.add(canonicalFile)) {
            // skip files we've already seen (may be looping directory links)
            continue;
        }

        if (file.isDirectory() && processorConfig.getRecurseSubdirs()) {
            files.addAll(selectFiles(hdfs, canonicalFile, filesVisited));
        } else if (!file.isDirectory() && processorConfig.getPathFilter(dir).accept(canonicalFile)) {
            final long fileAge = System.currentTimeMillis() - file.getModificationTime();
            if (processorConfig.getMinimumAge() < fileAge && fileAge < processorConfig.getMaximumAge()) {
                files.add(canonicalFile);

                if (getLogger().isDebugEnabled()) {
                    getLogger().debug(this + " selected file at path: " + canonicalFile.toString());
                }
            }
        }
    }
    return files;
}
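Several of the examples above share the same pattern: list a directory and recurse only into entries for which isDirectory() returns true. Below is a condensed sketch of that pattern; the class and method names are illustrative assumptions, not taken from any of the projects above.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class DirectoryWalker {
    // Collect all regular files under root, skipping "_"-prefixed folders
    // (the same convention the Kylin examples use for Hadoop metadata directories).
    public static List<Path> listFilesRecursively(FileSystem fs, Path root) throws IOException {
        List<Path> result = new ArrayList<>();
        for (FileStatus status : fs.listStatus(root)) {
            if (status.isDirectory()) {
                if (!status.getPath().getName().startsWith("_")) {
                    result.addAll(listFilesRecursively(fs, status.getPath()));
                }
            } else {
                result.add(status.getPath());
            }
        }
        return result;
    }
}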