Example usage for org.apache.hadoop.fs FileStatus isDirectory

List of usage examples for org.apache.hadoop.fs FileStatus isDirectory

Introduction

This page lists example usages of org.apache.hadoop.fs.FileStatus.isDirectory().

Prototype

public boolean isDirectory() 

Document

Is this a directory?
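
Before the project examples below, here is a minimal, self-contained sketch of the call. The path and class name are placeholders for illustration; the snippet simply lists a directory and separates sub-directories from files.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path root = new Path(args.length > 0 ? args[0] : "/tmp"); // placeholder path
        FileSystem fs = root.getFileSystem(conf);
        for (FileStatus stat : fs.listStatus(root)) {
            if (stat.isDirectory()) {
                System.out.println("dir : " + stat.getPath());
            } else {
                System.out.println("file: " + stat.getPath() + " (" + stat.getLen() + " bytes)");
            }
        }
    }
}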

Usage

From source file:org.apache.kylin.engine.mr.common.AbstractHadoopJob.java

License:Apache License

public static int addInputDirs(String[] inputs, Job job) throws IOException {
    int ret = 0; // return number of added folders
    for (String inp : inputs) {
        inp = inp.trim();
        if (inp.endsWith("/*")) {
            inp = inp.substring(0, inp.length() - 2);
            FileSystem fs = HadoopUtil.getWorkingFileSystem(job.getConfiguration());
            Path path = new Path(inp);

            if (!exists(fs, path)) {
                logger.warn("Path not exist:" + path.toString());
                continue;
            }

            FileStatus[] fileStatuses = fs.listStatus(path);
            boolean hasDir = false;
            for (FileStatus stat : fileStatuses) {
                if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
                    hasDir = true;
                    ret += addInputDirs(new String[] { stat.getPath().toString() }, job);
                }
            }
            if (fileStatuses.length > 0 && !hasDir) {
                ret += addInputDirs(new String[] { path.toString() }, job);
            }
        } else {
            logger.debug("Add input " + inp);
            FileInputFormat.addInputPath(job, new Path(inp));
            ret++;
        }
    }
    return ret;
}
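
A sketch of a possible call site for the helper above; the configuration, job name, and input paths are illustrative assumptions, not taken from Kylin. Paths ending in "/*" are expanded by the directory scan shown above.

// Illustrative only: build a job and register input folders, expanding "/*" suffixes.
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "example-job");
int added = addInputDirs(new String[] { "/kylin/input/2016/*", "/kylin/input/static" }, job);
System.out.println("Added " + added + " input folders");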

From source file:org.apache.kylin.engine.spark.SparkUtil.java

License:Apache License

/**
 * Read the given path as a Java RDD; the path may contain second-level sub folders.
 * @param inputPath the input path on HDFS
 * @param fs the file system used to list the input path
 * @param sc the Java Spark context used to read the sequence files
 * @param keyClass the key class of the sequence files
 * @param valueClass the value class of the sequence files
 * @return a pair RDD over the sequence files found under the input path
 * @throws IOException
 */
public static JavaPairRDD parseInputPath(String inputPath, FileSystem fs, JavaSparkContext sc, Class keyClass,
        Class valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
    for (FileStatus stat : fileStatuses) {
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }

    if (!hasDir) {
        return sc.sequenceFile(inputHDFSPath.toString(), keyClass, valueClass);
    }

    return sc.sequenceFile(StringUtil.join(inputFolders, ","), keyClass, valueClass);
}
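
A possible caller might look like the following; the Spark context setup, path, and key/value classes are assumptions for illustration, not Kylin code.

// Illustrative only: read every sequence file under the path (or its sub folders).
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("parse-input-example"));
FileSystem fs = FileSystem.get(sc.hadoopConfiguration());
JavaPairRDD rdd = SparkUtil.parseInputPath("/kylin/intermediate/example_table", fs, sc, Text.class, Text.class);
System.out.println("partitions = " + rdd.getNumPartitions());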

From source file:org.apache.kylin.job.hadoop.AbstractHadoopJob.java

License:Apache License

public void addInputDirs(String input, Job job) throws IOException {
    for (String inp : StringSplitter.split(input, ",")) {
        inp = inp.trim();
        if (inp.endsWith("/*")) {
            inp = inp.substring(0, inp.length() - 2);
            FileSystem fs = FileSystem.get(job.getConfiguration());
            Path path = new Path(inp);
            FileStatus[] fileStatuses = fs.listStatus(path);
            boolean hasDir = false;
            for (FileStatus stat : fileStatuses) {
                if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
                    hasDir = true;
                    addInputDirs(stat.getPath().toString(), job);
                }
            }
            if (fileStatuses.length > 0 && !hasDir) {
                addInputDirs(path.toString(), job);
            }
        } else {
            logger.debug("Add input " + inp);
            FileInputFormat.addInputPath(job, new Path(inp));
        }
    }
}

From source file:org.apache.kylin.source.hive.ITHiveTezUnionAllTest.java

License:Apache License

private void testMaterializeView(boolean isDistributeBy) throws Exception {
    KylinConfig config = getTestConfig();

    /**
     * For UT debug
     * config.setProperty("kylin.job.use-remote-cli", "true");
     */

    String viewName = "test_union_all_view";
    String tableName = "test_union_all_table";

    HiveCmdBuilder hiveCmdBuilder = new HiveCmdBuilder();
    JobEngineConfig jobConf = new JobEngineConfig(config);
    String storagePath = JobBuilderSupport.getJobWorkingDir(jobConf, "it-test") + "/" + tableName;

    StringBuilder testCmd = new StringBuilder();
    testCmd.append("USE " + config.getHiveDatabaseForIntermediateTable() + ";").append("\n");
    testCmd.append("SET hive.execution.engine=tez;");
    testCmd.append("DROP VIEW IF EXISTS " + viewName + ";\n");
    testCmd.append("CREATE VIEW " + viewName
            + " AS SELECT * FROM test_kylin_fact UNION ALL SELECT * FROM test_kylin_fact").append(";\n");
    testCmd.append("DROP TABLE IF EXISTS " + tableName + ";\n");
    testCmd.append("CREATE TABLE IF NOT EXISTS " + tableName + "\n");
    testCmd.append("LOCATION '" + storagePath + "'\n");
    testCmd.append("AS SELECT * FROM " + viewName + "\n");
    if (isDistributeBy)
        hiveCmdBuilder.addStatementWithRedistributeBy(testCmd);
    else
        hiveCmdBuilder.addStatement(testCmd.toString());

    Path rootPath = new Path(storagePath);
    FileSystem fs = HadoopUtil.getFileSystem(storagePath);

    fs.delete(rootPath, true);
    fs.mkdirs(rootPath);

    config.getCliCommandExecutor().execute(hiveCmdBuilder.build());

    rootPath = fs.makeQualified(rootPath);
    for (FileStatus statsFolder : fs.listStatus(rootPath)) {
        if (isDistributeBy)
            Assert.assertTrue(!statsFolder.isDirectory());
        else
            Assert.assertTrue(statsFolder.isDirectory());
    }

    HiveCmdBuilder cleanupCmdBuilder = new HiveCmdBuilder();
    StringBuilder cleanupCmd = new StringBuilder();
    cleanupCmd.append("USE " + config.getHiveDatabaseForIntermediateTable() + ";").append("\n");
    cleanupCmd.append("DROP VIEW IF EXISTS " + viewName + ";\n");
    cleanupCmd.append("DROP TABLE IF EXISTS " + tableName + ";\n");
    cleanupCmdBuilder.addStatement(cleanupCmd.toString());
    config.getCliCommandExecutor().execute(cleanupCmdBuilder.build());
    fs.delete(rootPath, true);
}

From source file:org.apache.lens.server.query.QueryResultPurger.java

License:Apache License

public void purgePaths(Path path, DateUtil.TimeDiff retention, boolean purgeDirectory) throws IOException {
    int counter = 0;
    FileSystem fs = path.getFileSystem(conf);
    FileStatus[] fileList = fs.listStatus(path);
    for (FileStatus f : fileList) {
        if ((f.isFile() || (f.isDirectory() && purgeDirectory)) && canBePurged(f, retention)) {
            try {
                if (fs.delete(f.getPath(), true)) {
                    counter++;
                } else {
                    getMetrics().incrCounter(this.getClass(), QUERY_RESULT_PURGER_ERROR_COUNTER);
                }
            } catch (IOException e) {
                getMetrics().incrCounter(this.getClass(), QUERY_RESULT_PURGER_ERROR_COUNTER);
            }
        }
    }
    log.info("Purged {} files/directories in {}", counter, path.toString());
}
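
The canBePurged helper referenced above is not shown on this page. A plausible stand-in, driven only by FileStatus metadata, could compare the modification time against a retention window in milliseconds; this is a guess at the intent, not the Lens implementation, and it swaps the DateUtil.TimeDiff parameter for a plain long.

// Hypothetical retention check; the real canBePurged may use DateUtil.TimeDiff differently.
private boolean olderThan(FileStatus f, long retentionMillis) {
    return System.currentTimeMillis() - f.getModificationTime() > retentionMillis;
}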

From source file:org.apache.lens.server.query.TestQueryService.java

License:Apache License

/**
 * Read result set.
 *
 * @param resultset the resultset
 * @param handle    the handle
 * @param isDir     whether the persisted result is expected to be a directory
 * @return the list of result rows
 * @throws IOException Signals that an I/O exception has occurred.
 */
public static List<String> readResultSet(PersistentQueryResult resultset, QueryHandle handle, boolean isDir)
        throws IOException {
    assertTrue(resultset.getPersistedURI().contains(handle.toString()));
    Path actualPath = new Path(resultset.getPersistedURI());
    FileSystem fs = actualPath.getFileSystem(new Configuration());
    List<String> actualRows = new ArrayList<>();
    if (fs.getFileStatus(actualPath).isDir()) {
        assertTrue(isDir);
        for (FileStatus fstat : fs.listStatus(actualPath)) {
            if (!fstat.isDirectory()) {
                addRowsFromFile(actualRows, fs, fstat.getPath());
            }
        }
    } else {
        assertFalse(isDir);
        addRowsFromFile(actualRows, fs, actualPath);
    }
    return actualRows;
}
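
Note that the test above checks the parent path with the deprecated FileStatus.isDir(); on current Hadoop versions the equivalent, non-deprecated check is isDirectory(), as in the sketch below (shown for comparison, not a change to the Lens test).

// Same check with the non-deprecated method.
if (fs.getFileStatus(actualPath).isDirectory()) {
    // iterate child files as in readResultSet above
}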

From source file:org.apache.manifoldcf.crawler.connectors.hdfs.HDFSRepositoryConnector.java

License:Apache License

/** Queue "seed" documents.  Seed documents are the starting places for crawling activity.  Documents
* are seeded when this method calls appropriate methods in the passed in ISeedingActivity object.
*
* This method can choose to find repository changes that happen only during the specified time interval.
* The seeds recorded by this method will be viewed by the framework based on what the
* getConnectorModel() method returns.
*
* It is not a big problem if the connector chooses to create more seeds than are
* strictly necessary; it is merely a question of overall work required.
*
* The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
* For continuous crawling jobs, this method will
* be called once, when the job starts, and at various periodic intervals as the job executes.
*
* When a job's specification is changed, the framework automatically resets the seeding version string to null.  The
* seeding version string may also be set to null on each job run, depending on the connector model returned by
* getConnectorModel().
*
* Note that it is always ok to send MORE documents rather than less to this method.
* The connector will be connected before this method can be called.
*@param activities is the interface this method should use to perform whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
*@param seedTime is the end of the time range of documents to consider, exclusive.
*@param lastSeedVersionString is the last seeding version string for this job, or null if the job has no previous seeding version string.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*@return an updated seeding version string, to be stored with the job.
*/
@Override
public String addSeedDocuments(ISeedingActivity activities, Specification spec, String lastSeedVersion,
        long seedTime, int jobMode) throws ManifoldCFException, ServiceInterruption {

    String path = StringUtils.EMPTY;
    int i = 0;
    while (i < spec.getChildCount()) {
        SpecificationNode sn = spec.getChild(i);
        if (sn.getType().equals("startpoint")) {
            path = sn.getAttributeValue("path");

            FileStatus fileStatus = getObject(new Path(path));
            if (fileStatus.isDirectory()) {
                activities.addSeedDocument(fileStatus.getPath().toUri().toString());
            }
        }
        i++;
    }
    return "";
}

From source file:org.apache.manifoldcf.crawler.connectors.hdfs.HDFSRepositoryConnector.java

License:Apache License

/** Process a set of documents.
* This is the method that should cause each document to be fetched, processed, and the results either added
* to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
* The document specification allows this class to filter what is done based on the job.
* The connector will be connected before this method can be called.
*@param documentIdentifiers is the set of document identifiers to process.
*@param statuses are the currently-stored document versions for each document in the set of document identifiers
* passed in above.
*@param activities is the interface this method should use to queue up new document references
* and ingest documents.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
*/
@Override
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
        IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
        throws ManifoldCFException, ServiceInterruption {

    for (String documentIdentifier : documentIdentifiers) {

        String versionString;

        FileStatus fileStatus = getObject(new Path(documentIdentifier));
        if (fileStatus != null) {

            boolean isDirectory = fileStatus.isDirectory();

            if (isDirectory) {
                // If HDFS directory modify dates are transitive, as they are on Unix,
                // then getting the modify date of the current version is sufficient
                // to detect any downstream changes we need to be aware of.
                // (If this turns out to be a bad assumption, this should simply set rval[i] ="").
                long lastModified = fileStatus.getModificationTime();
                versionString = new Long(lastModified).toString();

                if (activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)) {
                    // Process directory!
                    String entityReference = documentIdentifier;
                    FileStatus[] fileStatuses = getChildren(fileStatus.getPath());
                    if (fileStatuses == null) {
                        continue;
                    }
                    for (int j = 0; j < fileStatuses.length; j++) {
                        FileStatus fs = fileStatuses[j];
                        String canonicalPath = fs.getPath().toString();
                        if (checkInclude(session.getUri().toString(), fs, canonicalPath, spec)) {
                            activities.addDocumentReference(canonicalPath, documentIdentifier,
                                    RELATIONSHIP_CHILD);
                        }
                    }
                }
            } else {
                long lastModified = fileStatus.getModificationTime();
                StringBuilder sb = new StringBuilder();
                // Check if the path is to be converted.  We record that info in the version string so that we'll reindex documents whose
                // URI's change.
                String nameNode = nameNodeProtocol + "://" + nameNodeHost + ":" + nameNodePort;
                String convertPath = findConvertPath(nameNode, spec, fileStatus.getPath());
                if (convertPath != null) {
                    // Record the path.
                    sb.append("+");
                    pack(sb, convertPath, '+');
                } else
                    sb.append("-");
                sb.append(new Long(lastModified).toString());
                versionString = sb.toString();
                // We will record document fetch as an activity
                long startTime = System.currentTimeMillis();
                String errorCode = null;
                String errorDesc = null;
                long fileSize = 0;

                if (activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)) {
                    // Process file!
                    if (!checkIngest(session.getUri().toString(), fileStatus, spec)) {
                        activities.noDocument(documentIdentifier, versionString);
                        continue;
                    }

                    // It is a file to be indexed.
                    long fileLength = fileStatus.getLen();
                    String fileName = fileStatus.getPath().getName();
                    String mimeType = mapExtensionToMimeType(fileStatus.getPath().getName());
                    Date modifiedDate = new Date(fileStatus.getModificationTime());
                    try {
                        String uri;
                        if (convertPath != null) {
                            uri = convertToWGETURI(convertPath);
                        } else {
                            uri = fileStatus.getPath().toUri().toString();
                        }

                        if (!activities.checkLengthIndexable(fileLength)) {
                            errorCode = activities.EXCLUDED_LENGTH;
                            errorDesc = "Excluding document because of file length ('" + fileLength + "')";
                            activities.noDocument(documentIdentifier, versionString);
                            continue;
                        }

                        if (!activities.checkURLIndexable(uri)) {
                            errorCode = activities.EXCLUDED_URL;
                            errorDesc = "Excluding document because of URL ('" + uri + "')";
                            activities.noDocument(documentIdentifier, versionString);
                            continue;
                        }

                        if (!activities.checkMimeTypeIndexable(mimeType)) {
                            errorCode = activities.EXCLUDED_MIMETYPE;
                            errorDesc = "Excluding document because of mime type (" + mimeType + ")";
                            activities.noDocument(documentIdentifier, versionString);
                            continue;
                        }

                        if (!activities.checkDateIndexable(modifiedDate)) {
                            errorCode = activities.EXCLUDED_DATE;
                            errorDesc = "Excluding document because of date (" + modifiedDate + ")";
                            activities.noDocument(documentIdentifier, versionString);
                            continue;
                        }

                        // Prepare the metadata part of RepositoryDocument
                        RepositoryDocument data = new RepositoryDocument();

                        data.setFileName(fileName);
                        data.setMimeType(mimeType);
                        data.setModifiedDate(modifiedDate);

                        data.addField("uri", uri);

                        BackgroundStreamThread t = new BackgroundStreamThread(getSession(),
                                new Path(documentIdentifier));
                        try {
                            t.start();
                            boolean wasInterrupted = false;
                            try {
                                InputStream is = t.getSafeInputStream();
                                try {
                                    data.setBinary(is, fileSize);
                                    activities.ingestDocumentWithException(documentIdentifier, versionString,
                                            uri, data);
                                } finally {
                                    is.close();
                                }
                            } catch (java.net.SocketTimeoutException e) {
                                throw e;
                            } catch (InterruptedIOException e) {
                                wasInterrupted = true;
                                throw e;
                            } catch (ManifoldCFException e) {
                                if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
                                    wasInterrupted = true;
                                }
                                throw e;
                            } finally {
                                if (!wasInterrupted) {
                                    // This does a join
                                    t.finishUp();
                                }
                            }

                            // No errors.  Record the fact that we made it.
                            errorCode = "OK";
                            // Length we did in bytes
                            fileSize = fileStatus.getLen();

                        } catch (InterruptedException e) {
                            // We were interrupted out of the join, most likely.  Before we abandon the thread,
                            // send a courtesy interrupt.
                            t.interrupt();
                            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
                                    ManifoldCFException.INTERRUPTED);
                        } catch (java.net.SocketTimeoutException e) {
                            errorCode = "IOERROR";
                            errorDesc = e.getMessage();
                            handleIOException(e);
                        } catch (InterruptedIOException e) {
                            t.interrupt();
                            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
                                    ManifoldCFException.INTERRUPTED);
                        } catch (IOException e) {
                            errorCode = "IOERROR";
                            errorDesc = e.getMessage();
                            handleIOException(e);
                        }
                    } finally {
                        if (errorCode != null) {
                            activities.recordActivity(new Long(startTime), ACTIVITY_READ, new Long(fileSize),
                                    documentIdentifier, errorCode, errorDesc, null);
                        }
                    }
                }
            }
        } else {
            activities.deleteDocument(documentIdentifier);
            continue;
        }
    }
}
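
Several helpers used above (getObject, checkIngest, mapExtensionToMimeType, convertToWGETURI) are defined elsewhere in the connector and are not reproduced on this page. As one example, a hypothetical extension-to-MIME mapping of the kind mapExtensionToMimeType suggests might look like this; the real ManifoldCF implementation may differ.

// Hypothetical sketch only; not the connector's actual mapping.
protected static String mapExtensionToMimeType(String fileName) {
    int dot = fileName.lastIndexOf('.');
    String ext = (dot < 0) ? "" : fileName.substring(dot + 1).toLowerCase(java.util.Locale.ROOT);
    switch (ext) {
    case "txt":
        return "text/plain";
    case "htm":
    case "html":
        return "text/html";
    case "pdf":
        return "application/pdf";
    default:
        return "application/octet-stream";
    }
}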

From source file:org.apache.manifoldcf.crawler.connectors.hdfs.HDFSRepositoryConnector.java

License:Apache License

/** Check if a file or directory should be included, given a document specification.
 *@param nameNode is the name node URI against which startpoint paths are resolved.
 *@param fileStatus is the status of the file or directory being checked.
 *@param fileName is the canonical file name.
 *@param documentSpecification is the specification.
 *@return true if it should be included.
 */
protected static boolean checkInclude(String nameNode, FileStatus fileStatus, String fileName,
        Specification documentSpecification) throws ManifoldCFException {
    if (Logging.connectors.isDebugEnabled()) {
        Logging.connectors.debug("Checking whether to include file '" + fileName + "'");
    }

    String pathPart;
    String filePart;
    if (fileStatus.isDirectory()) {
        pathPart = fileName;
        filePart = null;
    } else {
        pathPart = fileStatus.getPath().getParent().toString();
        filePart = fileStatus.getPath().getName();
    }

    // Scan until we match a startpoint
    int i = 0;
    while (i < documentSpecification.getChildCount()) {
        SpecificationNode sn = documentSpecification.getChild(i++);
        if (sn.getType().equals("startpoint")) {
            String path = null;
            try {
                path = new URI(nameNode).resolve(sn.getAttributeValue("path")).toString();
            } catch (URISyntaxException e) {
                e.printStackTrace();
            }
            if (Logging.connectors.isDebugEnabled()) {
                Logging.connectors.debug("Checking path '" + path + "' against canonical '" + pathPart + "'");
            }
            // Compare with filename
            int matchEnd = matchSubPath(path, pathPart);
            if (matchEnd == -1) {
                if (Logging.connectors.isDebugEnabled()) {
                    Logging.connectors
                            .debug("Match check '" + path + "' against canonical '" + pathPart + "' failed");
                }

                continue;
            }
            // matchEnd is the start of the rest of the path (after the match) in fileName.
            // We need to walk through the rules and see whether it's in or out.
            int j = 0;
            while (j < sn.getChildCount()) {
                SpecificationNode node = sn.getChild(j++);
                String flavor = node.getType();
                String match = node.getAttributeValue("match");
                String type = node.getAttributeValue("type");
                // If type is "file", then our match string is against the filePart.
                // If filePart is null, then this rule is simply skipped.
                String sourceMatch;
                int sourceIndex;
                if (type.equals("file")) {
                    if (filePart == null) {
                        continue;
                    }
                    sourceMatch = filePart;
                    sourceIndex = 0;
                } else {
                    if (filePart != null) {
                        continue;
                    }
                    sourceMatch = pathPart;
                    sourceIndex = matchEnd;
                }

                if (flavor.equals("include")) {
                    if (checkMatch(sourceMatch, sourceIndex, match)) {
                        return true;
                    }
                } else if (flavor.equals("exclude")) {
                    if (checkMatch(sourceMatch, sourceIndex, match)) {
                        return false;
                    }
                }
            }
        }
    }
    if (Logging.connectors.isDebugEnabled()) {
        Logging.connectors.debug("Not including '" + fileName + "' because no matching rules");
    }

    return false;
}
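
The matchSubPath and checkMatch helpers used above are also defined elsewhere in the connector. A hypothetical prefix match along the lines of matchSubPath could look like this; the actual implementation may normalize separators and case differently.

// Hypothetical sketch only; returns the index just past the matched prefix, or -1.
protected static int matchSubPath(String subPath, String fullPath) {
    if (!fullPath.startsWith(subPath)) {
        return -1;
    }
    int end = subPath.length();
    if (end < fullPath.length() && fullPath.charAt(end) == '/') {
        end++; // skip the separator so the caller sees the remainder of the path
    }
    return end;
}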

From source file:org.apache.nifi.processors.hadoop.GetHDFS.java

License:Apache License

/**
 * Poll HDFS for files to process that match the configured file filters.
 *
 * @param hdfs the file system to poll
 * @param dir the directory to list
 * @param filesVisited the set of paths already visited, used to avoid looping directory links
 * @return files to process
 * @throws java.io.IOException ex
 */
protected Set<Path> selectFiles(final FileSystem hdfs, final Path dir, Set<Path> filesVisited)
        throws IOException, InterruptedException {
    if (null == filesVisited) {
        filesVisited = new HashSet<>();
    }

    if (!hdfs.exists(dir)) {
        throw new IOException("Selection directory " + dir.toString() + " doesn't appear to exist!");
    }

    final Set<Path> files = new HashSet<>();

    FileStatus[] fileStatuses = getUserGroupInformation()
            .doAs((PrivilegedExceptionAction<FileStatus[]>) () -> hdfs.listStatus(dir));
    for (final FileStatus file : fileStatuses) {
        if (files.size() >= MAX_WORKING_QUEUE_SIZE) {
            // no need to make the files set larger than what we would queue anyway
            break;
        }

        final Path canonicalFile = file.getPath();

        if (!filesVisited.add(canonicalFile)) { // skip files we've already seen (may be looping directory links)
            continue;
        }

        if (file.isDirectory() && processorConfig.getRecurseSubdirs()) {
            files.addAll(selectFiles(hdfs, canonicalFile, filesVisited));

        } else if (!file.isDirectory() && processorConfig.getPathFilter(dir).accept(canonicalFile)) {
            final long fileAge = System.currentTimeMillis() - file.getModificationTime();
            if (processorConfig.getMinimumAge() < fileAge && fileAge < processorConfig.getMaximumAge()) {
                files.add(canonicalFile);

                if (getLogger().isDebugEnabled()) {
                    getLogger().debug(this + " selected file at path: " + canonicalFile.toString());
                }

            }
        }
    }
    return files;
}
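
For comparison, when only regular files are wanted and sub-directories should always be descended into, Hadoop's FileSystem.listFiles(path, true) walks the tree itself and yields only files, so no explicit isDirectory() branch is needed. The sketch below is standalone, with placeholder paths and an example age filter; it is not part of the NiFi processor.

// Standalone sketch (inside a method that throws IOException); paths are placeholders.
Configuration conf = new Configuration();
Path dir = new Path("/flows/incoming"); // hypothetical directory
FileSystem hdfs = dir.getFileSystem(conf);
Set<Path> files = new HashSet<>();
RemoteIterator<LocatedFileStatus> it = hdfs.listFiles(dir, true);
while (it.hasNext()) {
    LocatedFileStatus file = it.next();
    long fileAge = System.currentTimeMillis() - file.getModificationTime();
    if (fileAge > 60_000L) { // example minimum-age filter
        files.add(file.getPath());
    }
}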