List of usage examples for org.apache.hadoop.fs.FileStatus.isDirectory()
public boolean isDirectory()
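Before the project examples, here is a minimal sketch of how isDirectory() is typically obtained and used. The path and class name are hypothetical placeholders, not taken from any of the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical path; replace with a real HDFS or local path.
        Path path = new Path("/tmp/example");
        FileSystem fs = path.getFileSystem(new Configuration());

        FileStatus status = fs.getFileStatus(path);
        if (status.isDirectory()) {
            // List immediate children and report which of them are directories.
            for (FileStatus child : fs.listStatus(path)) {
                System.out.println(child.getPath() + " directory=" + child.isDirectory());
            }
        } else {
            System.out.println(path + " is a regular file of " + status.getLen() + " bytes");
        }
    }
}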
From source file:org.apache.kylin.engine.mr.common.AbstractHadoopJob.java
License:Apache License
public static int addInputDirs(String[] inputs, Job job) throws IOException {
    int ret = 0; // return number of added folders
    for (String inp : inputs) {
        inp = inp.trim();
        if (inp.endsWith("/*")) {
            inp = inp.substring(0, inp.length() - 2);
            FileSystem fs = HadoopUtil.getWorkingFileSystem(job.getConfiguration());
            Path path = new Path(inp);
            if (!exists(fs, path)) {
                logger.warn("Path not exist:" + path.toString());
                continue;
            }
            FileStatus[] fileStatuses = fs.listStatus(path);
            boolean hasDir = false;
            for (FileStatus stat : fileStatuses) {
                if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
                    hasDir = true;
                    ret += addInputDirs(new String[] { stat.getPath().toString() }, job);
                }
            }
            if (fileStatuses.length > 0 && !hasDir) {
                ret += addInputDirs(new String[] { path.toString() }, job);
            }
        } else {
            logger.debug("Add input " + inp);
            FileInputFormat.addInputPath(job, new Path(inp));
            ret++;
        }
    }
    return ret;
}
From source file:org.apache.kylin.engine.spark.SparkUtil.java
License:Apache License
/**
 * Read the given path as a Java RDD; the path can have a second-level sub folder.
 * @param inputPath
 * @param fs
 * @param sc
 * @param keyClass
 * @param valueClass
 * @return
 * @throws IOException
 */
public static JavaPairRDD parseInputPath(String inputPath, FileSystem fs, JavaSparkContext sc, Class keyClass,
        Class valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
    for (FileStatus stat : fileStatuses) {
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }
    if (!hasDir) {
        return sc.sequenceFile(inputHDFSPath.toString(), keyClass, valueClass);
    }
    return sc.sequenceFile(StringUtil.join(inputFolders, ","), keyClass, valueClass);
}
From source file:org.apache.kylin.job.hadoop.AbstractHadoopJob.java
License:Apache License
public void addInputDirs(String input, Job job) throws IOException {
    for (String inp : StringSplitter.split(input, ",")) {
        inp = inp.trim();
        if (inp.endsWith("/*")) {
            inp = inp.substring(0, inp.length() - 2);
            FileSystem fs = FileSystem.get(job.getConfiguration());
            Path path = new Path(inp);
            FileStatus[] fileStatuses = fs.listStatus(path);
            boolean hasDir = false;
            for (FileStatus stat : fileStatuses) {
                if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
                    hasDir = true;
                    addInputDirs(stat.getPath().toString(), job);
                }
            }
            if (fileStatuses.length > 0 && !hasDir) {
                addInputDirs(path.toString(), job);
            }
        } else {
            logger.debug("Add input " + inp);
            FileInputFormat.addInputPath(job, new Path(inp));
        }
    }
}
From source file:org.apache.kylin.source.hive.ITHiveTezUnionAllTest.java
License:Apache License
private void testMaterializeView(boolean isDistributeBy) throws Exception {
    KylinConfig config = getTestConfig();
    /**
     * For UT debug
     * config.setProperty("kylin.job.use-remote-cli", "true");
     */
    String viewName = "test_union_all_view";
    String tableName = "test_union_all_table";
    HiveCmdBuilder hiveCmdBuilder = new HiveCmdBuilder();
    JobEngineConfig jobConf = new JobEngineConfig(config);
    String storagePath = JobBuilderSupport.getJobWorkingDir(jobConf, "it-test") + "/" + tableName;

    StringBuilder testCmd = new StringBuilder();
    testCmd.append("USE " + config.getHiveDatabaseForIntermediateTable() + ";").append("\n");
    testCmd.append("SET hive.execution.engine=tez;");
    testCmd.append("DROP VIEW IF EXISTS " + viewName + ";\n");
    testCmd.append("CREATE VIEW " + viewName
            + " AS SELECT * FROM test_kylin_fact UNION ALL SELECT * FROM test_kylin_fact").append(";\n");
    testCmd.append("DROP TABLE IF EXISTS " + tableName + ";\n");
    testCmd.append("CREATE TABLE IF NOT EXISTS " + tableName + "\n");
    testCmd.append("LOCATION '" + storagePath + "'\n");
    testCmd.append("AS SELECT * FROM " + viewName + "\n");

    if (isDistributeBy)
        hiveCmdBuilder.addStatementWithRedistributeBy(testCmd);
    else
        hiveCmdBuilder.addStatement(testCmd.toString());

    Path rootPath = new Path(storagePath);
    FileSystem fs = HadoopUtil.getFileSystem(storagePath);
    fs.delete(rootPath, true);
    fs.mkdirs(rootPath);

    config.getCliCommandExecutor().execute(hiveCmdBuilder.build());

    rootPath = fs.makeQualified(rootPath);
    for (FileStatus statsFolder : fs.listStatus(rootPath)) {
        if (isDistributeBy)
            Assert.assertTrue(!statsFolder.isDirectory());
        else
            Assert.assertTrue(statsFolder.isDirectory());
    }

    HiveCmdBuilder cleanupCmdBuilder = new HiveCmdBuilder();
    StringBuilder cleanupCmd = new StringBuilder();
    cleanupCmd.append("USE " + config.getHiveDatabaseForIntermediateTable() + ";").append("\n");
    cleanupCmd.append("DROP VIEW IF EXISTS " + viewName + ";\n");
    cleanupCmd.append("DROP TABLE IF EXISTS " + tableName + ";\n");
    cleanupCmdBuilder.addStatement(cleanupCmd.toString());
    config.getCliCommandExecutor().execute(cleanupCmdBuilder.build());
    fs.delete(rootPath, true);
}
From source file:org.apache.lens.server.query.QueryResultPurger.java
License:Apache License
public void purgePaths(Path path, DateUtil.TimeDiff retention, boolean purgeDirectory) throws IOException {
    int counter = 0;
    FileSystem fs = path.getFileSystem(conf);
    FileStatus[] fileList = fs.listStatus(path);
    for (FileStatus f : fileList) {
        if ((f.isFile() || (f.isDirectory() && purgeDirectory)) && canBePurged(f, retention)) {
            try {
                if (fs.delete(f.getPath(), true)) {
                    counter++;
                } else {
                    getMetrics().incrCounter(this.getClass(), QUERY_RESULT_PURGER_ERROR_COUNTER);
                }
            } catch (IOException e) {
                getMetrics().incrCounter(this.getClass(), QUERY_RESULT_PURGER_ERROR_COUNTER);
            }
        }
    }
    log.info("Purged {} files/directories in {}", counter, path.toString());
}
From source file:org.apache.lens.server.query.TestQueryService.java
License:Apache License
/**
 * Read result set.
 *
 * @param resultset the resultset
 * @param handle    the handle
 * @param isDir     the is dir
 * @return the list
 * @throws IOException Signals that an I/O exception has occurred.
 */
public static List<String> readResultSet(PersistentQueryResult resultset, QueryHandle handle, boolean isDir)
        throws IOException {
    assertTrue(resultset.getPersistedURI().contains(handle.toString()));
    Path actualPath = new Path(resultset.getPersistedURI());
    FileSystem fs = actualPath.getFileSystem(new Configuration());
    List<String> actualRows = new ArrayList<>();
    if (fs.getFileStatus(actualPath).isDir()) {
        assertTrue(isDir);
        for (FileStatus fstat : fs.listStatus(actualPath)) {
            if (!fstat.isDirectory()) {
                addRowsFromFile(actualRows, fs, fstat.getPath());
            }
        }
    } else {
        assertFalse(isDir);
        addRowsFromFile(actualRows, fs, actualPath);
    }
    return actualRows;
}
From source file:org.apache.manifoldcf.crawler.connectors.hdfs.HDFSRepositoryConnector.java
License:Apache License
/** Queue "seed" documents. Seed documents are the starting places for crawling activity. Documents * are seeded when this method calls appropriate methods in the passed in ISeedingActivity object. * * This method can choose to find repository changes that happen only during the specified time interval. * The seeds recorded by this method will be viewed by the framework based on what the * getConnectorModel() method returns.//from w ww .jav a 2 s .c o m * * It is not a big problem if the connector chooses to create more seeds than are * strictly necessary; it is merely a question of overall work required. * * The end time and seeding version string passed to this method may be interpreted for greatest efficiency. * For continuous crawling jobs, this method will * be called once, when the job starts, and at various periodic intervals as the job executes. * * When a job's specification is changed, the framework automatically resets the seeding version string to null. The * seeding version string may also be set to null on each job run, depending on the connector model returned by * getConnectorModel(). * * Note that it is always ok to send MORE documents rather than less to this method. * The connector will be connected before this method can be called. *@param activities is the interface this method should use to perform whatever framework actions are desired. *@param spec is a document specification (that comes from the job). *@param seedTime is the end of the time range of documents to consider, exclusive. *@param lastSeedVersionString is the last seeding version string for this job, or null if the job has no previous seeding version string. *@param jobMode is an integer describing how the job is being run, whether continuous or once-only. *@return an updated seeding version string, to be stored with the job. */ @Override public String addSeedDocuments(ISeedingActivity activities, Specification spec, String lastSeedVersion, long seedTime, int jobMode) throws ManifoldCFException, ServiceInterruption { String path = StringUtils.EMPTY; int i = 0; while (i < spec.getChildCount()) { SpecificationNode sn = spec.getChild(i); if (sn.getType().equals("startpoint")) { path = sn.getAttributeValue("path"); FileStatus fileStatus = getObject(new Path(path)); if (fileStatus.isDirectory()) { activities.addSeedDocument(fileStatus.getPath().toUri().toString()); } } i++; } return ""; }
From source file:org.apache.manifoldcf.crawler.connectors.hdfs.HDFSRepositoryConnector.java
License:Apache License
/** Process a set of documents.
 * This is the method that should cause each document to be fetched, processed, and the results either added
 * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
 * The document specification allows this class to filter what is done based on the job.
 * The connector will be connected before this method can be called.
 *@param documentIdentifiers is the set of document identifiers to process.
 *@param statuses are the currently-stored document versions for each document in the set of document identifiers
 * passed in above.
 *@param activities is the interface this method should use to queue up new document references
 * and ingest documents.
 *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
 *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
 */
@Override
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
        IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
        throws ManifoldCFException, ServiceInterruption {
    for (String documentIdentifier : documentIdentifiers) {
        String versionString;
        FileStatus fileStatus = getObject(new Path(documentIdentifier));
        if (fileStatus != null) {
            boolean isDirectory = fileStatus.isDirectory();
            if (isDirectory) {
                // If HDFS directory modify dates are transitive, as they are on Unix,
                // then getting the modify date of the current version is sufficient
                // to detect any downstream changes we need to be aware of.
                // (If this turns out to be a bad assumption, this should simply set rval[i] = "").
                long lastModified = fileStatus.getModificationTime();
                versionString = new Long(lastModified).toString();
                if (activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)) {
                    // Process directory!
                    String entityReference = documentIdentifier;
                    FileStatus[] fileStatuses = getChildren(fileStatus.getPath());
                    if (fileStatuses == null) {
                        continue;
                    }
                    for (int j = 0; j < fileStatuses.length; j++) {
                        FileStatus fs = fileStatuses[j];
                        String canonicalPath = fs.getPath().toString();
                        if (checkInclude(session.getUri().toString(), fs, canonicalPath, spec)) {
                            activities.addDocumentReference(canonicalPath, documentIdentifier, RELATIONSHIP_CHILD);
                        }
                    }
                }
            } else {
                long lastModified = fileStatus.getModificationTime();
                StringBuilder sb = new StringBuilder();
                // Check if the path is to be converted. We record that info in the version string so that we'll
                // reindex documents whose URIs change.
                String nameNode = nameNodeProtocol + "://" + nameNodeHost + ":" + nameNodePort;
                String convertPath = findConvertPath(nameNode, spec, fileStatus.getPath());
                if (convertPath != null) {
                    // Record the path.
                    sb.append("+");
                    pack(sb, convertPath, '+');
                } else
                    sb.append("-");
                sb.append(new Long(lastModified).toString());
                versionString = sb.toString();

                // We will record document fetch as an activity
                long startTime = System.currentTimeMillis();
                String errorCode = null;
                String errorDesc = null;
                long fileSize = 0;

                if (activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)) {
                    // Process file!
                    if (!checkIngest(session.getUri().toString(), fileStatus, spec)) {
                        activities.noDocument(documentIdentifier, versionString);
                        continue;
                    }

                    // It is a file to be indexed.
                    long fileLength = fileStatus.getLen();
                    String fileName = fileStatus.getPath().getName();
                    String mimeType = mapExtensionToMimeType(fileStatus.getPath().getName());
                    Date modifiedDate = new Date(fileStatus.getModificationTime());
                    try {
                        String uri;
                        if (convertPath != null) {
                            uri = convertToWGETURI(convertPath);
                        } else {
                            uri = fileStatus.getPath().toUri().toString();
                        }

                        if (!activities.checkLengthIndexable(fileLength)) {
                            errorCode = activities.EXCLUDED_LENGTH;
                            errorDesc = "Excluding document because of file length ('" + fileLength + "')";
                            activities.noDocument(documentIdentifier, versionString);
                            continue;
                        }

                        if (!activities.checkURLIndexable(uri)) {
                            errorCode = activities.EXCLUDED_URL;
                            errorDesc = "Excluding document because of URL ('" + uri + "')";
                            activities.noDocument(documentIdentifier, versionString);
                            continue;
                        }

                        if (!activities.checkMimeTypeIndexable(mimeType)) {
                            errorCode = activities.EXCLUDED_MIMETYPE;
                            errorDesc = "Excluding document because of mime type (" + mimeType + ")";
                            activities.noDocument(documentIdentifier, versionString);
                            continue;
                        }

                        if (!activities.checkDateIndexable(modifiedDate)) {
                            errorCode = activities.EXCLUDED_DATE;
                            errorDesc = "Excluding document because of date (" + modifiedDate + ")";
                            activities.noDocument(documentIdentifier, versionString);
                            continue;
                        }

                        // Prepare the metadata part of RepositoryDocument
                        RepositoryDocument data = new RepositoryDocument();
                        data.setFileName(fileName);
                        data.setMimeType(mimeType);
                        data.setModifiedDate(modifiedDate);
                        data.addField("uri", uri);

                        BackgroundStreamThread t = new BackgroundStreamThread(getSession(),
                                new Path(documentIdentifier));
                        try {
                            t.start();
                            boolean wasInterrupted = false;
                            try {
                                InputStream is = t.getSafeInputStream();
                                try {
                                    data.setBinary(is, fileSize);
                                    activities.ingestDocumentWithException(documentIdentifier, versionString, uri,
                                            data);
                                } finally {
                                    is.close();
                                }
                            } catch (java.net.SocketTimeoutException e) {
                                throw e;
                            } catch (InterruptedIOException e) {
                                wasInterrupted = true;
                                throw e;
                            } catch (ManifoldCFException e) {
                                if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
                                    wasInterrupted = true;
                                }
                                throw e;
                            } finally {
                                if (!wasInterrupted) {
                                    // This does a join
                                    t.finishUp();
                                }
                            }

                            // No errors. Record the fact that we made it.
                            errorCode = "OK";
                            // Length we did in bytes
                            fileSize = fileStatus.getLen();
                        } catch (InterruptedException e) {
                            // We were interrupted out of the join, most likely. Before we abandon the thread,
                            // send a courtesy interrupt.
                            t.interrupt();
                            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
                                    ManifoldCFException.INTERRUPTED);
                        } catch (java.net.SocketTimeoutException e) {
                            errorCode = "IOERROR";
                            errorDesc = e.getMessage();
                            handleIOException(e);
                        } catch (InterruptedIOException e) {
                            t.interrupt();
                            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
                                    ManifoldCFException.INTERRUPTED);
                        } catch (IOException e) {
                            errorCode = "IOERROR";
                            errorDesc = e.getMessage();
                            handleIOException(e);
                        }
                    } finally {
                        if (errorCode != null) {
                            activities.recordActivity(new Long(startTime), ACTIVITY_READ, new Long(fileSize),
                                    documentIdentifier, errorCode, errorDesc, null);
                        }
                    }
                }
            }
        } else {
            activities.deleteDocument(documentIdentifier);
            continue;
        }
    }
}
From source file:org.apache.manifoldcf.crawler.connectors.hdfs.HDFSRepositoryConnector.java
License:Apache License
/** Check if a file or directory should be included, given a document specification.
 *@param fileName is the canonical file name.
 *@param documentSpecification is the specification.
 *@return true if it should be included.
 */
protected static boolean checkInclude(String nameNode, FileStatus fileStatus, String fileName,
        Specification documentSpecification) throws ManifoldCFException {
    if (Logging.connectors.isDebugEnabled()) {
        Logging.connectors.debug("Checking whether to include file '" + fileName + "'");
    }

    String pathPart;
    String filePart;
    if (fileStatus.isDirectory()) {
        pathPart = fileName;
        filePart = null;
    } else {
        pathPart = fileStatus.getPath().getParent().toString();
        filePart = fileStatus.getPath().getName();
    }

    // Scan until we match a startpoint
    int i = 0;
    while (i < documentSpecification.getChildCount()) {
        SpecificationNode sn = documentSpecification.getChild(i++);
        if (sn.getType().equals("startpoint")) {
            String path = null;
            try {
                path = new URI(nameNode).resolve(sn.getAttributeValue("path")).toString();
            } catch (URISyntaxException e) {
                e.printStackTrace();
            }
            if (Logging.connectors.isDebugEnabled()) {
                Logging.connectors.debug("Checking path '" + path + "' against canonical '" + pathPart + "'");
            }
            // Compare with filename
            int matchEnd = matchSubPath(path, pathPart);
            if (matchEnd == -1) {
                if (Logging.connectors.isDebugEnabled()) {
                    Logging.connectors
                            .debug("Match check '" + path + "' against canonical '" + pathPart + "' failed");
                }
                continue;
            }
            // matchEnd is the start of the rest of the path (after the match) in fileName.
            // We need to walk through the rules and see whether it's in or out.
            int j = 0;
            while (j < sn.getChildCount()) {
                SpecificationNode node = sn.getChild(j++);
                String flavor = node.getType();
                String match = node.getAttributeValue("match");
                String type = node.getAttributeValue("type");
                // If type is "file", then our match string is against the filePart.
                // If filePart is null, then this rule is simply skipped.
                String sourceMatch;
                int sourceIndex;
                if (type.equals("file")) {
                    if (filePart == null) {
                        continue;
                    }
                    sourceMatch = filePart;
                    sourceIndex = 0;
                } else {
                    if (filePart != null) {
                        continue;
                    }
                    sourceMatch = pathPart;
                    sourceIndex = matchEnd;
                }
                if (flavor.equals("include")) {
                    if (checkMatch(sourceMatch, sourceIndex, match)) {
                        return true;
                    }
                } else if (flavor.equals("exclude")) {
                    if (checkMatch(sourceMatch, sourceIndex, match)) {
                        return false;
                    }
                }
            }
        }
    }
    if (Logging.connectors.isDebugEnabled()) {
        Logging.connectors.debug("Not including '" + fileName + "' because no matching rules");
    }
    return false;
}
From source file:org.apache.nifi.processors.hadoop.GetHDFS.java
License:Apache License
/**
 * Poll HDFS for files to process that match the configured file filters.
 *
 * @param hdfs hdfs
 * @param dir dir
 * @param filesVisited filesVisited
 * @return files to process
 * @throws java.io.IOException ex
 */
protected Set<Path> selectFiles(final FileSystem hdfs, final Path dir, Set<Path> filesVisited)
        throws IOException, InterruptedException {
    if (null == filesVisited) {
        filesVisited = new HashSet<>();
    }

    if (!hdfs.exists(dir)) {
        throw new IOException("Selection directory " + dir.toString() + " doesn't appear to exist!");
    }

    final Set<Path> files = new HashSet<>();

    FileStatus[] fileStatuses = getUserGroupInformation()
            .doAs((PrivilegedExceptionAction<FileStatus[]>) () -> hdfs.listStatus(dir));
    for (final FileStatus file : fileStatuses) {
        if (files.size() >= MAX_WORKING_QUEUE_SIZE) {
            // no need to make the files set larger than what we would queue anyway
            break;
        }

        final Path canonicalFile = file.getPath();

        if (!filesVisited.add(canonicalFile)) {
            // skip files we've already seen (may be looping directory links)
            continue;
        }

        if (file.isDirectory() && processorConfig.getRecurseSubdirs()) {
            files.addAll(selectFiles(hdfs, canonicalFile, filesVisited));
        } else if (!file.isDirectory() && processorConfig.getPathFilter(dir).accept(canonicalFile)) {
            final long fileAge = System.currentTimeMillis() - file.getModificationTime();
            if (processorConfig.getMinimumAge() < fileAge && fileAge < processorConfig.getMaximumAge()) {
                files.add(canonicalFile);

                if (getLogger().isDebugEnabled()) {
                    getLogger().debug(this + " selected file at path: " + canonicalFile.toString());
                }
            }
        }
    }
    return files;
}
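Several of the examples above share the same pattern: list a directory and recurse only into entries for which isDirectory() returns true. Below is a condensed sketch of that pattern; the class and method names are illustrative assumptions, not taken from any of the projects above.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class DirectoryWalker {
    // Collect all regular files under root, skipping "_"-prefixed folders
    // (the same convention the Kylin examples use for Hadoop metadata directories).
    public static List<Path> listFilesRecursively(FileSystem fs, Path root) throws IOException {
        List<Path> result = new ArrayList<>();
        for (FileStatus status : fs.listStatus(root)) {
            if (status.isDirectory()) {
                if (!status.getPath().getName().startsWith("_")) {
                    result.addAll(listFilesRecursively(fs, status.getPath()));
                }
            } else {
                result.add(status.getPath());
            }
        }
        return result;
    }
}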