List of usage examples for org.apache.hadoop.fs FileStatus getPath
public Path getPath()
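Before the project examples below, here is a minimal, self-contained sketch (not taken from any of those projects) of the typical getPath() pattern: list a directory with FileSystem.listStatus() and call FileStatus.getPath() on each entry to recurse into subdirectories or act on individual files. The directory /tmp/data and the class name GetPathExample are placeholders.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetPathExample {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Placeholder directory; replace with a path that exists on your cluster.
        listRecursively(fs, new Path("/tmp/data"));
    }

    // Walks a directory tree, printing the fully qualified path of every file.
    private static void listRecursively(FileSystem fs, Path dir) throws IOException {
        for (FileStatus status : fs.listStatus(dir)) {
            // getPath() returns the Path that this FileStatus describes.
            Path p = status.getPath();
            if (status.isDirectory()) {
                listRecursively(fs, p);
            } else {
                System.out.println(p + " (" + status.getLen() + " bytes)");
            }
        }
    }
}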
From source file:com.hdfs.concat.clean.Clean.java
License:Apache License
public int cleanup(Path p) {
    try {
        if (fs.isFile(p)) {
            if (conf.get(TARGET_EXPR) != null) {
                if (p.getName().matches(conf.get(TARGET_EXPR))) {
                    warnOrDelete(p);
                }
            }
            if (conf.get(CUTTOFF_MILLIS) != null) {
                if (fs.getFileStatus(p).getModificationTime() < cutoff) {
                    warnOrDelete(p);
                }
            }
        }
        if (fs.isDirectory(p)) {
            for (FileStatus stat : fs.listStatus(p)) {
                cleanup(stat.getPath());
            }
            if (fs.listStatus(p).length == 0) {
                if (conf.get(TARGET_EXPR) != null) {
                    if (p.getName().matches(conf.get(TARGET_EXPR))) {
                        warnOrDelete(p);
                    }
                }
                if (conf.get(CUTTOFF_MILLIS) != null) {
                    if (fs.getFileStatus(p).getModificationTime() < cutoff) {
                        warnOrDelete(p);
                    }
                }
            }
        }
    } catch (IOException e) {
        System.out.println("exception " + e);
        return 7;
    }
    return 0;
}
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
private void standAlone() throws IOException {
    String absSrcDir = fs.makeQualified(srcDir).toUri().getPath();
    String absOutDir = fs.makeQualified(outDir).toUri().getPath();

    Text bucket = new Text(absSrcDir + "-0");
    List<Text> files = new ArrayList<Text>();

    FileStatus[] contents = fs.listStatus(new Path(absSrcDir));

    for (FileStatus content : contents) {
        if (!content.isDir()) {
            if (ignoredFiles != null) {
                // Check for files to skip
                ignoredFiles.reset(content.getPath().toUri().getPath());
                if (ignoredFiles.matches()) {
                    LOG.trace("Ignoring " + content.getPath().toString());
                    continue;
                }
            }
            files.add(new Text(content.getPath().toUri().getPath()));
        }
    }

    /*
     * Is the directory empty?
     */
    if (files.isEmpty()) {
        return;
    }

    /*
     * We trick the reducer into doing some work for us by setting these configuration properties.
     */
    job.set("mapred.tip.id", "task_000000000000_00000_r_000000");
    job.set("mapred.task.id", "attempt_000000000000_0000_r_000000_0");
    job.set("mapred.output.dir", absOutDir);

    /*
     * File output committer needs this.
     */
    fs.mkdirs(new Path(absOutDir, "_temporary"));

    CrushReducer reducer = new CrushReducer();
    reducer.configure(job);
    reducer.reduce(bucket, files.iterator(), new NullOutputCollector<Text, Text>(), Reporter.NULL);
    reducer.close();

    /*
     * Use a glob here because the temporary and task attempt work dirs have funny names.
     * Include a * at the end to cover wildcards for compressed files.
     */
    Path crushOutput = new Path(absOutDir + "/*/*/crush" + absSrcDir + "/" + dest.getName() + "*");

    FileStatus[] statuses = fs.globStatus(crushOutput);

    if (statuses == null || 1 != statuses.length) {
        throw new AssertionError("Did not find the expected output in " + crushOutput.toString());
    }

    rename(statuses[0].getPath(), dest.getParent(), dest.getName());
}
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
private void cloneOutput() throws IOException {

    List<FileStatus> listStatus = getOutputMappings();

    /*
     * Initialize to empty list, in which case swap() will be a no-op. The reference is then replaced with a real list,
     * which is used in the subsequent iterations.
     */
    List<Path> crushInput = emptyList();

    Text srcFile = new Text();
    Text crushOut = new Text();
    Text prevCrushOut = new Text();

    for (FileStatus partFile : listStatus) {
        Path path = partFile.getPath();

        Reader reader = new Reader(fs, path, fs.getConf());

        try {
            while (reader.next(srcFile, crushOut)) {
                if (!crushOut.equals(prevCrushOut)) {
                    swap(crushInput, prevCrushOut.toString());

                    prevCrushOut.set(crushOut);
                    crushInput = new LinkedList<Path>();
                }

                crushInput.add(new Path(srcFile.toString()));
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Trapped exception when closing " + path, e);
            }
        }

        swap(crushInput, prevCrushOut.toString());
    }
}
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
/**
 * Moves the skipped files to the output directory. Called when operating in normal (non-clone) mode.
 */
private void moveOutput() throws IOException {

    List<FileStatus> listStatus = getOutputMappings();

    Text srcFile = new Text();
    Text crushOut = new Text();

    Set<String> crushOutputFiles = new HashSet<String>(nBuckets);

    for (FileStatus partFile : listStatus) {
        Path path = partFile.getPath();

        Reader reader = new Reader(fs, path, fs.getConf());

        try {
            while (reader.next(srcFile, crushOut)) {
                crushOutputFiles.add(new Path(crushOut.toString()).toUri().getPath());
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Trapped exception when closing " + path, e);
            }
        }
    }

    assert crushOutputFiles.size() == nBuckets;

    /*
     * The crush output files will appear in a subdirectory of the output directory. The subdirectory will be the full
     * path of the input directory that was crushed. E.g.
     *
     * Crush input:
     * /user/me/input/dir1/file1
     * /user/me/input/dir1/file2
     * /user/me/input/dir2/file3
     * /user/me/input/dir2/file4
     * /user/me/input/dir3/dir4/file5
     * /user/me/input/dir3/dir4/file6
     *
     * Crush output:
     * /user/me/output/user/me/input/dir1/crushed_file ...
     * /user/me/output/user/me/input/dir2/crushed_file ...
     * /user/me/output/user/me/input/dir2/dir3/dir4/crushed_file ...
     *
     * We need to collapse this down to:
     * /user/me/output/dir1/crushed_file ...
     * /user/me/output/dir2/crushed_file ...
     * /user/me/output/dir2/dir3/dir4/crushed_file ...
     */
    String srcDirName = fs.makeQualified(srcDir).toUri().getPath();

    String destName = fs.makeQualified(dest).toUri().getPath();
    String partToReplace = fs.makeQualified(outDir).toUri().getPath() + "/crush" + srcDirName;

    print(Verbosity.INFO, "\n\nCopying crush files to " + destName);

    for (String crushOutputFile : crushOutputFiles) {
        Path srcPath = new Path(crushOutputFile);
        Path destPath = new Path(destName + crushOutputFile.substring(partToReplace.length())).getParent();

        rename(srcPath, destPath, null);
    }

    print(Verbosity.INFO, "\n\nMoving skipped files to " + destName);

    /*
     * Don't forget to move the files that were not crushed to the output dir so that the output dir has all the data
     * that was in the input dir, the difference being there are fewer files in the output dir.
     */
    for (String name : skippedFiles) {
        Path srcPath = new Path(name);
        Path destPath = new Path(destName + name.substring(srcDirName.length())).getParent();

        rename(srcPath, destPath, null);
    }
}
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
void writeDirs() throws IOException {

    print(Verbosity.INFO, "\n\nUsing temporary directory " + tmpDir.toUri().getPath());

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class, CompressionType.BLOCK);

    int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));

    Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFiles == null)
                            return true;
                        ignoredFiles.reset(testPath.toUri().getPath());
                        return !ignoredFiles.matches();
                    }
                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, " is empty");

                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            boolean changed = uncrushedFiles.add(path.toUri().getPath());
                            assert changed : path.toUri().getPath();

                            long fileLength = content.getLen();

                            if (fileLength <= maxEligibleSize) {
                                crushables.add(content);
                                crushableBytes += fileLength;
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output file and then
                     * increment the number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException("Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, " has no crushable files");

                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;

                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);

                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();

                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                        } else {
                            nBuckets += crushFiles.size();

                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

                            print(Verbosity.INFO, " => " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();

                                List<String> bucketFiles = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), bucketFiles.size()));

                                key.set(bucketId);

                                for (String f : bucketFiles) {
                                    boolean changed = uncrushedFiles.remove(f);
                                    assert changed : f;

                                    pathMatcher.reset(f);
                                    pathMatcher.matches();

                                    value.set(pathMatcher.group(5));

                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n\n Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + bucketFiles, e);
        }
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();
    assert partitions.size() <= numPartitions;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);

    IntWritable partNum = new IntWritable();

    try {
        for (Bucket partition : partitions) {
            String partitionName = partition.name();

            partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));

            for (String bucketId : partition.contents()) {
                key.set(bucketId);

                writer.append(key, partNum);
            }
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }

    DataOutputStream countersStream = fs.create(this.counters);

    try {
        jobCounters.write(countersStream);
    } finally {
        try {
            countersStream.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }
}
From source file:com.hdfstoftp.main.HdfsToFtp.java
/**
 * Copies files from HDFS to an FTP server according to the supplied configuration
 * (source directory, destination directory, optional time/regex filters, rename and
 * overwrite behaviour). The original (non-English) comments in this example were
 * unreadable and have been replaced with English equivalents inferred from the code.
 *
 * @param config the transfer configuration
 * @return boolean
 * @throws Exception
 */
private static boolean copyFromHDFSToFTP(Config config) throws Exception {
    // Get the source HDFS file system
    Configuration conf = new Configuration();
    FileSystem srcFS = FileSystem.get(conf);
    long start = System.currentTimeMillis();
    boolean isRename = config.isRenameUploaded();
    int retryTimes = config.getRetryTimes();
    // Source and destination paths
    String dstPath = config.getDestDir();
    Path src = new Path(config.getSouceDir());
    FileStatus fileStatus = srcFS.getFileStatus(src);
    String subDir = null;
    if (fileStatus.isDirectory()) {
        // The source is a directory
        if (isRename) {
            // Uploaded files will be moved into a "rename" subdirectory
            subDir = Config.RENAME_DIR;
            srcFS.mkdirs(new Path(fileStatus.getPath(), subDir));
        }
        int threadNum = config.getThreadNum();
        // Thread pool for uploads
        ExecutorService threadPool = Executors.newFixedThreadPool(threadNum);
        // FTP client pool
        FTPClientPool ftpPool = new FTPClientPool(threadNum, new FtpClientFactory(config.getFTPClientConfig()));
        FTPClient ftpClient = ftpPool.borrowObject();
        // Create the destination directory on the FTP server
        ftpClient.makeDirectory(dstPath);
        ftpPool.returnObject(ftpClient);
        // List the source files
        FileStatus contents[] = srcFS.listStatus(src);
        long beginFilter = 0;
        long endFileter = 0;
        if (config.getCommandLine().hasOption("d") || config.getCommandLine().hasOption("h")
                || config.getCommandLine().hasOption("t")) {
            // Filter files by the given time range
            beginFilter = System.currentTimeMillis();
            Long[] timeRange = parseTimeRange(config.getCommandLine());
            contents = getNewContents(timeRange, contents);
            endFileter = System.currentTimeMillis();
        }
        // Filter files by regular expression
        if (config.getCommandLine().hasOption("r")) {
            beginFilter = System.currentTimeMillis();
            contents = getFilterContents(config.getCommandLine().getOptionValue("r").trim(), contents);
            endFileter = System.currentTimeMillis();
        }
        logger.info("total file count:" + contents.length);
        Map<String, String> fileNameMap = null;
        long beginSkip = 0;
        long endSkip = 0;
        boolean overwrite = true;
        if (config.getCommandLine().hasOption("o")) {
            overwrite = "true".equals(config.getCommandLine().getOptionValue("o").trim());
        }
        if (!overwrite) {
            // Collect the files already present on the FTP server so they can be skipped
            beginSkip = System.currentTimeMillis();
            fileNameMap = getFileNameMap(dstPath, ftpPool);
            endSkip = System.currentTimeMillis();
        }
        int skiped = 0;
        List<Future<?>> futureList = new ArrayList<Future<?>>();
        for (int i = 0; i < contents.length; i++) {
            if (!overwrite && fileNameMap.containsKey(contents[i].getPath().getName())) {
                // Already present on the FTP server, skip it
                skiped++;
                Log.info("skiped filename:" + contents[i].getPath().getName());
                continue;
            }
            if (contents[i].isDirectory()) {
                continue;
            }
            // Submit the upload task
            Future<?> future = threadPool.submit(new UploadFileTask(srcFS, contents[i].getPath(),
                    new Path(dstPath, contents[i].getPath().getName()), ftpPool, false, isRename, subDir, retryTimes));
            futureList.add(future);
        }
        int transfered = 0;
        int failed = 0;
        for (Future<?> future : futureList) {
            Boolean computeResult = (Boolean) future.get();
            if (computeResult) {
                transfered++;
                if (transfered % 50 == 0 || transfered == contents.length) {
                    logger.info("have transfered:" + transfered + " files");
                }
            } else {
                failed++;
                logger.error("failed transter:" + failed + " files");
            }
        }
        // Shut down the thread pool
        threadPool.shutdown();
        // Close the FTPClient pool
        ftpPool.close();
        // Print statistics
        logger.info("filter time:" + (endFileter - beginFilter) + " ms");
        if (!overwrite) {
            logger.info("skip time:" + (endSkip - beginSkip) + " ms");
        }
        logger.info("total file count:" + contents.length);
        logger.info("total transtered: " + transfered + ",total failed:" + failed + ",total skiped:" + skiped);
    } else {
        // The source is a single file listing the paths to upload
        BufferedReader reader = null;
        FtpClientFactory facotry = new FtpClientFactory(config.getFTPClientConfig());
        FTPClient ftpClient = null;
        InputStream in = null;
        try {
            Path path = fileStatus.getPath();
            if (!path.getName().contains("log")) {
            }
            reader = new BufferedReader(new FileReader(new File(path.toUri().getPath())));
            String str = null;
            ftpClient = facotry.makeObject();
            while ((str = reader.readLine()) != null) {
                String[] feilds = str.split("&");
                Path filePath = null;
                if (feilds.length == 2 && feilds[1] != "") {
                    filePath = new Path(feilds[1]);
                    in = srcFS.open(filePath);
                    boolean result = ftpClient.storeFile(dstPath, in);
                    System.out.println(ftpClient.getReplyCode());
                    if (result) {
                        logger.info(filePath.toString());
                    } else {
                        logger_failed.info(filePath.toString());
                    }
                } else {
                    continue;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            in.close();
            reader.close();
            facotry.destroyObject(ftpClient);
        }
    }
    long end = System.currentTimeMillis();
    logger.info("finished transfer,total time:" + (end - start) / 1000 + "s");
    return true;
}
From source file:com.hdfstoftp.main.HdfsToFtp.java
/**
 * Filters the given file statuses, keeping only the files whose names match the regular expression.
 *
 * @param reg regular expression applied to file names
 * @param contents the statuses to filter
 * @return FileStatus[]
 */
public static FileStatus[] getFilterContents(String reg, FileStatus[] contents) {
    Pattern pattern = Pattern.compile(reg);
    List<FileStatus> statusList = new ArrayList<FileStatus>();
    for (FileStatus status : contents) {
        if (!status.isDirectory()) {
            String fileName = status.getPath().getName();
            Matcher matcher = pattern.matcher(fileName);
            if (matcher.matches()) {
                statusList.add(status);
            }
        }
    }
    return statusList.toArray(new FileStatus[statusList.size()]);
}
From source file:com.hortonworks.historian.nifi.reporter.HistorianDeanReporter.java
License:Apache License
@Override
public void onTrigger(ReportingContext reportingContext) {
    // create the Atlas client if we don't have one
    /*
    Properties props = System.getProperties();
    props.setProperty("atlas.conf", "/usr/hdp/current/atlas-client/conf");
    getLogger().info("***************** atlas.conf has been set to: " + props.getProperty("atlas.conf"));
    */
    inputs = new ArrayList<Referenceable>();
    outputs = new ArrayList<Referenceable>();
    //EventAccess eventAccess = reportingContext.getEventAccess();
    //int pageSize = reportingContext.getProperty(ACTION_PAGE_SIZE).asInteger();
    lateDataRoot = reportingContext.getProperty(LATE_DATA_ROOT).getValue();
    lateDataTasksPath = lateDataRoot + "/tasks";
    atlasUrl = reportingContext.getProperty(ATLAS_URL).getValue();
    nifiUrl = reportingContext.getProperty(NIFI_URL).getValue();
    nameNodeUrl = reportingContext.getProperty(NAME_NODE_URL).getValue();
    druidBrokerUrl = reportingContext.getProperty(DRUID_BROKER_HTTP_ENDPOINT).getValue();
    druidOverlordUrl = reportingContext.getProperty(DRUID_OVERLORD_HTTP_ENDPOINT).getValue();
    hiveServerUri = reportingContext.getProperty(HIVE_SERVER_CONNECTION_STRING).getValue();
    TAG_DIMENSION_NAME = reportingContext.getProperty(HISTORIAN_TAG_DIMENSION).getValue();
    //druidMetaUri = reportingContext.getProperty(DRUID_METASTORE_CONNECTION_STRING).getValue();
    String[] atlasURL = { atlasUrl };

    if (atlasClient == null) {
        getLogger().info("Creating new Atlas client for {}", new Object[] { atlasUrl });
        atlasClient = new AtlasClient(atlasURL, basicAuth);
    }
    if (atlasVersion == 0.0) {
        atlasVersion = Double.valueOf(getAtlasVersion(atlasUrl + "/api/atlas/admin/version", basicAuth));
        getLogger().info("********** Atlas Version is: " + atlasVersion);
    }
    getLogger().info("********** Number of Reports Sent: " + timesTriggered);

    if (timesTriggered == 0) {
        String hiveUsername = "hive";
        String hivePassword = "hive";
        try {
            getLogger().info("********** Establishing Connection to HDFS...");
            String hdfsPath = nameNodeUrl + "/";
            Configuration conf = new Configuration();
            conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
            conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
            fs = FileSystem.get(new URI(hdfsPath), conf);

            //createHDFSDirectory(lateDataRoot);
            createHDFSDirectory(lateDataTasksPath);

            getLogger().info("********** Checking for Unresolved Indexing Tasks...");
            FileStatus[] fileStatus = fs.listStatus(new Path(lateDataTasksPath));
            for (FileStatus status : fileStatus) {
                if (status.isDirectory()) {
                    String[] address = status.getPath().toString().split("/");
                    String currentPath = status.getPath().toString();
                    String currentDirName = address[address.length - 1];
                    String currentTaskId = currentDirName.replace("|", ":");
                    String ingestSpec = readHDFSFile(currentPath + "/ingestSpec");
                    List<String> sourceData = Arrays.asList(readHDFSFile(currentPath + "/sourceData").split(","));

                    getLogger().info("********** Loading Unresolved Indexing Task:" + currentTaskId);
                    Map<String, Object> currentTaskMetaData = new HashMap<String, Object>();
                    currentTaskMetaData.put("ingestSpec", ingestSpec);
                    currentTaskMetaData.put("sourceData", sourceData);
                    deltaIndexTasks.put(currentTaskId, currentTaskMetaData);
                }
            }

            getLogger().info("********** Establishing Connection to Hive Server...");
            Class.forName("org.apache.hive.jdbc.HiveDriver");
            hiveConnection = DriverManager.getConnection(hiveServerUri, hiveUsername, hivePassword);

            getLogger().info("********** Create Business Taxonomy Terms...");
            String termPath = "/Catalog/terms/Unassigned";
            String termDefinition = "{\"name\":\"Unassigned\",\"description\":\"\"}";
            createBusinessTerm(termPath, termDefinition);

            getLogger().info("********** Checking if data model has been created...");
            /*
            try {
                atlasClient.getType(HistorianDataTypes.TAG_DIMENSION.getName());
                getLogger().info("********************* Trait: " + HistorianDataTypes.TAG_DIMENSION.getName()
                        + " is already present");
            } catch (AtlasServiceException e) {
                getLogger().info("***************** Creating " + HistorianDataTypes.TAG_DIMENSION.getName()
                        + " Trait...");
                atlasClient.createTraitType(HistorianDataTypes.TAG_DIMENSION.getName());
            }*/
            String historianDataModelJSON = generateHistorianDataModel();
            getLogger().info("********** Historian Data Model as JSON = " + historianDataModelJSON);
            //atlasClient.createType(historianDataModelJSON);
            getLogger().info("********** Created Types: " + atlasClient.createType(historianDataModelJSON));
            updateHiveColumnClassAttributes();
        } catch (AtlasServiceException e) {
            e.printStackTrace();
        } catch (AtlasException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (URISyntaxException e) {
            e.printStackTrace();
        }
    }
    timesTriggered++;

    getLogger().info(
            "********** Looking for Druid Datasources to expose as Hive Tables or update with new information...");
    Iterator<String> resultIterator = getDruidDataSourceList().iterator();
    while (resultIterator.hasNext()) {
        String dataSource = resultIterator.next();
        dataSourceDetails.put(dataSource, getDruidDataSourceDetails(dataSource));
        getLogger().info("********** Exposing Druid Data Source: " + dataSource);
        exposeDruidDataSourceAsHiveTable(dataSource);
        getLogger().info("********** Update Atlas Hive Tables and Column for Druid Data Source: " + dataSource);
        updateDataSourceHiveColumnAttributes(dataSource);
    }

    getLogger().info("********** Checking for Late Arriving Data...");
    List<String> dataSourceExclusions = new ArrayList<String>();
    List<String> deletedTasks = new ArrayList<String>();
    Map<String, Object> newTasks = new HashMap<String, Object>();
    for (String taskId : deltaIndexTasks.keySet()) {
        String status = getIndexTaskStatus(taskId);
        if (status.equalsIgnoreCase("SUCCESS")) {
            getLogger().info("********** Indexing Task " + taskId
                    + " completed successfully, removing source data and task meta data...");
            List<String> sourceDataList = (List<String>) ((Map) deltaIndexTasks.get(taskId)).get("sourceData");
            Iterator<String> currentSourceObjectIterator = sourceDataList.iterator();
            while (currentSourceObjectIterator.hasNext()) {
                String currentSourceObject = currentSourceObjectIterator.next();
                deleteHDFSObject(currentSourceObject);
            }
            deleteHDFSObject(lateDataTasksPath + "/" + taskId.replace(":", "__"));
            deletedTasks.add(taskId);
        } else if (status.equalsIgnoreCase("PENDING") || status.equalsIgnoreCase("RUNNING")) {
            getLogger().info("********** Indexing Task " + taskId + " is currently " + status
                    + ", excluding source data from eligibility for new indexing tasks");
            List<String> sourceDataList = (List<String>) ((Map) deltaIndexTasks.get(taskId)).get("sourceData");
            dataSourceExclusions.addAll(sourceDataList);
        } else {
            getLogger().info("********** Indexing Task " + taskId + " is in " + status
                    + " state, excluding source data from eligibility for new indexing tasks");
            getLogger().info(
                    "********** Obtain task logs from Druid Overlord Console, address the problem, and then restart the task manually...");
            List<String> sourceDataList = (List<String>) ((Map) deltaIndexTasks.get(taskId)).get("sourceData");
            dataSourceExclusions.addAll(sourceDataList);
            String ingestSpec = (String) ((Map) deltaIndexTasks.get(taskId)).get("ingestSpec");
            String newTaskId = createDruidIndexingTask(ingestSpec);
            renameHDFSObject(lateDataTasksPath + "/" + taskId.replace(":", "__"),
                    lateDataTasksPath + "/" + newTaskId.replace(":", "__"));
            newTasks.put(newTaskId, ((Map) deltaIndexTasks.get(taskId)));
            deletedTasks.add(taskId);
        }
    }
    deltaIndexTasks.putAll(newTasks);
    deltaIndexTasks.keySet().removeAll(deletedTasks);

    indexLateData(dataSourceExclusions);
    getLogger().info("********** Done...");
}
From source file:com.hortonworks.historian.nifi.reporter.HistorianDeanReporter.java
License:Apache License
private void indexLateData(List<String> dataSourceExclusions) {
    String nifiControllersUrl = nifiUrl + "/nifi-api/flow/process-groups/root/controller-services";
    try {
        JSONArray controllers = getJSONFromUrl(nifiControllersUrl, basicAuth).getJSONArray("controllerServices");
        getLogger().info("********** Getting List of Druid Tranquility Controllers...");
        for (int i = 0; i < controllers.length(); i++) {
            JSONObject currentController = controllers.getJSONObject(i).getJSONObject("component");
            String currentControllerType = currentController.getString("type");
            if (currentControllerType
                    .equalsIgnoreCase("com.hortonworks.nifi.controller.DruidTranquilityController")) {
                String lateDataPath = lateDataRoot + "/" + currentController.getJSONObject("properties")
                        .getString("query_granularity").toLowerCase() + "/";
                getLogger().info("********** Checking for Late Arriving Data at HDFS Path: " + lateDataPath);
                if (fs.exists(new Path(lateDataPath))) {
                    FileStatus[] fileStatus = fs.listStatus(new Path(lateDataPath));
                    List<Date> dates = new ArrayList<Date>();
                    List<String> sourceData = new ArrayList<String>();
                    for (FileStatus status : fileStatus) {
                        String[] address = status.getPath().toString().split("/");
                        String currentBin = address[address.length - 1];
                        Date binDate = new SimpleDateFormat("yyyy-MM-dd-HH-mm").parse(currentBin);
                        sourceData.add(lateDataPath + currentBin);
                        dates.add(binDate);
                    }
                    ((Collection<?>) sourceData).removeAll(dataSourceExclusions);
                    getLogger().info("********** Detected " + sourceData.size()
                            + " bins of relevant late data, initiating Delta Indexing task...");
                    if (fileStatus.length > 0 && sourceData.size() > 0) {
                        String intervalStart = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")
                                .format(Collections.min(dates));
                        String intervalEnd = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")
                                .format(Collections.max(dates));
                        String bins = String.join(",", sourceData);
                        JSONArray dimensionsList = new JSONArray(Arrays.asList(currentController
                                .getJSONObject("properties").getString("dimensions_list").split(",")));
                        String ingestSpec = "{"
                                + " \"type\" : \"index_hadoop\","
                                + " \"spec\" : {"
                                + " \"dataSchema\" : {"
                                + " \"dataSource\": \"" + currentController.getJSONObject("properties").getString("data_source") + "\","
                                + " \"parser\" : {"
                                + " \"type\" : \"hadoopyString\","
                                + " \"parseSpec\" : {"
                                + " \"format\" : \"json\","
                                + " \"timestampSpec\" : {"
                                + " \"column\" : \"" + currentController.getJSONObject("properties").getString("timestamp_field") + "\","
                                + " \"format\" : \"auto\""
                                + " },"
                                + " \"dimensionsSpec\" : {"
                                + " \"dimensions\": " + dimensionsList
                                + " }"
                                + " }"
                                + " },"
                                + " \"metricsSpec\" : " + currentController.getJSONObject("properties").getString("aggregators_descriptor") + ","
                                + " \"granularitySpec\" : {"
                                + " \"type\" : \"uniform\","
                                + " \"segmentGranularity\" : \"" + currentController.getJSONObject("properties").getString("segment_granularity") + "\","
                                + " \"queryGranularity\" : \"" + currentController.getJSONObject("properties").getString("query_granularity") + "\","
                                + " \"intervals\": [\"" + intervalStart + "/" + intervalEnd + "\"]"
                                + " }"
                                + " },"
                                + " \"ioConfig\" : {"
                                + " \"type\" : \"hadoop\","
                                + " \"inputSpec\" : {"
                                + " \"type\" : \"multi\","
                                + " \"children\": ["
                                + " {"
                                + " \"type\" : \"dataSource\","
                                + " \"ingestionSpec\" : {"
                                + " \"dataSource\": \"" + currentController.getJSONObject("properties").getString("data_source") + "\","
                                + " \"intervals\": [\"" + intervalStart + "/" + intervalEnd + "\"]"
                                + " }"
                                + " },"
                                + " {"
                                + " \"type\" : \"static\","
                                + " \"paths\": \"" + bins + "\""
                                + " }"
                                + " ]"
                                + " }"
                                + " },"
                                + " \"tuningConfig\" : {"
                                + " \"type\": \"hadoop\""
                                + " }"
                                + " }"
                                + "}";
                        getLogger().info("********** Delta Ingestion Spec: " + ingestSpec);
                        String indexTaskId = createDruidIndexingTask(ingestSpec);
                        getLogger().info("********** Created Indexing Task " + indexTaskId);
                        Map<String, Object> currentTaskMetaData = new HashMap<String, Object>();
                        currentTaskMetaData.put("ingestSpec", ingestSpec);
                        currentTaskMetaData.put("sourceData", sourceData);
                        deltaIndexTasks.put(indexTaskId, currentTaskMetaData);
                        String currentTaskDirPath = lateDataTasksPath + "/" + indexTaskId.replace(":", "__");
                        getLogger().info("********** Persisting Record of Task: " + currentTaskDirPath);
                        currentTaskDirPath = createHDFSDirectory(currentTaskDirPath);
                        writeHDFSFile(currentTaskDirPath + "/ingestSpec", ingestSpec);
                        writeHDFSFile(currentTaskDirPath + "/sourceData", bins);
                    } else {
                        getLogger().info("********** " + lateDataPath + " does not contain any data...");
                    }
                } else {
                    getLogger().info("********** There is a Druid Controller mapped to " + lateDataPath
                            + ", however, the path does not yet exist...");
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (JSONException e) {
        e.printStackTrace();
    }
}
From source file:com.ibm.bi.dml.parser.DataExpression.java
License:Open Source License
/**
 * @param filename
 * @return
 * @throws LanguageException
 */
public JSONObject readMetadataFile(String filename, boolean conditional) throws LanguageException {
    JSONObject retVal = null;
    boolean exists = false;
    FileSystem fs = null;

    try {
        fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
    } catch (Exception e) {
        raiseValidateError("could not read the configuration file: " + e.getMessage(), false);
    }

    Path pt = new Path(filename);
    try {
        if (fs.exists(pt)) {
            exists = true;
        }
    } catch (Exception e) {
        exists = false;
    }

    boolean isDirBoolean = false;
    try {
        if (exists && fs.getFileStatus(pt).isDirectory())
            isDirBoolean = true;
        else
            isDirBoolean = false;
    } catch (Exception e) {
        raiseValidateError("error validing whether path " + pt.toString() + " is directory or not: "
                + e.getMessage(), conditional);
    }

    // CASE: filename is a directory -- process as a directory
    if (exists && isDirBoolean) {
        // read directory contents
        retVal = new JSONObject();

        FileStatus[] stats = null;
        try {
            stats = fs.listStatus(pt);
        } catch (Exception e) {
            raiseValidateError("for MTD file in directory, error reading directory with MTD file "
                    + pt.toString() + ": " + e.getMessage(), conditional);
        }

        for (FileStatus stat : stats) {
            Path childPath = stat.getPath(); // gives directory name
            if (childPath.getName().startsWith("part")) {
                BufferedReader br = null;
                try {
                    br = new BufferedReader(new InputStreamReader(fs.open(childPath)));
                } catch (Exception e) {
                    raiseValidateError("for MTD file in directory, error reading part of MTD file with path "
                            + childPath.toString() + ": " + e.getMessage(), conditional);
                }

                JSONObject childObj = null;
                try {
                    childObj = JSONHelper.parse(br);
                } catch (Exception e) {
                    raiseValidateError("for MTD file in directory, error parsing part of MTD file with path "
                            + childPath.toString() + ": " + e.getMessage(), conditional);
                }

                for (Object obj : childObj.entrySet()) {
                    @SuppressWarnings("unchecked")
                    Entry<Object, Object> e = (Entry<Object, Object>) obj;
                    Object key = e.getKey();
                    Object val = e.getValue();
                    retVal.put(key, val);
                }
            }
        } // end for
    }
    // CASE: filename points to a file
    else if (exists) {
        BufferedReader br = null;

        // try reading MTD file
        try {
            br = new BufferedReader(new InputStreamReader(fs.open(pt)));
        } catch (Exception e) {
            raiseValidateError("error reading MTD file with path " + pt.toString() + ": " + e.getMessage(),
                    conditional);
        }

        // try parsing MTD file
        try {
            retVal = JSONHelper.parse(br);
        } catch (Exception e) {
            raiseValidateError("error parsing MTD file with path " + pt.toString() + ": " + e.getMessage(),
                    conditional);
        }
    }

    return retVal;
}