List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
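Before the project examples, a quick self-contained sketch of what toUri() exposes. The paths here are made up for illustration:

import java.net.URI;
import org.apache.hadoop.fs.Path;

public class PathToUriDemo {
  public static void main(String[] args) {
    // A fully qualified path keeps its scheme and authority in the URI.
    Path qualified = new Path("hdfs://namenode:8020/user/alice/data");
    URI uri = qualified.toUri();
    System.out.println(uri.getScheme());    // hdfs
    System.out.println(uri.getAuthority()); // namenode:8020
    System.out.println(uri.getPath());      // /user/alice/data

    // A scheme-less path yields a URI whose scheme and authority are null.
    Path bare = new Path("/tmp/output");
    System.out.println(bare.toUri().getScheme()); // null
  }
}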
From source file:com.twitter.hraven.etl.JobFilePreprocessor.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  // When we started processing. This is also the upper limit of files we
  // accept; the next run will pick up the new incoming files.
  long processingStartMillis = System.currentTimeMillis();

  Configuration hbaseConf = HBaseConfiguration.create(getConf());

  // Grab input args and allow for -Dxyz style arguments
  String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

  // Grab the arguments we're looking for.
  CommandLine commandLine = parseArgs(otherArgs);

  // Output should be an hdfs path.
  FileSystem hdfs = FileSystem.get(hbaseConf);

  // Grab the output path argument
  String output = commandLine.getOptionValue("o");
  LOG.info(" output=" + output);
  Path outputPath = new Path(output);
  FileStatus outputFileStatus = hdfs.getFileStatus(outputPath);

  if (!outputFileStatus.isDir()) {
    throw new IOException("Output is not a directory: " + outputFileStatus.getPath().getName());
  }

  // Grab the input path argument
  String input;
  if (commandLine.hasOption("i")) {
    input = commandLine.getOptionValue("i");
  } else {
    input = hbaseConf.get("mapred.job.tracker.history.completed.location");
  }
  LOG.info("input=" + input);

  // Grab the batch-size argument
  int batchSize;
  if (commandLine.hasOption("b")) {
    try {
      batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
    } catch (NumberFormatException nfe) {
      throw new IllegalArgumentException(
          "batch size option -b is not a valid number: " + commandLine.getOptionValue("b"), nfe);
    }
    // Additional check
    if (batchSize < 1) {
      throw new IllegalArgumentException(
          "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
              + commandLine.getOptionValue("b"));
    }
  } else {
    batchSize = DEFAULT_BATCH_SIZE;
  }

  boolean forceAllFiles = commandLine.hasOption("f");
  LOG.info("forceAllFiles: " + forceAllFiles);

  Path inputPath = new Path(input);
  FileStatus inputFileStatus = hdfs.getFileStatus(inputPath);
  if (!inputFileStatus.isDir()) {
    throw new IOException("Input is not a directory: " + inputFileStatus.getPath().getName());
  }

  // Grab the cluster argument
  String cluster = commandLine.getOptionValue("c");
  LOG.info("cluster=" + cluster);

  /**
   * Grab the size limit for huge files to be moved. An hbase cell can't
   * store files bigger than maxFileSize, hence no need to consider them
   * for raw loading.
   * Reference: {@link https://github.com/twitter/hraven/issues/59}
   */
  String maxFileSizeStr = commandLine.getOptionValue("s");
  LOG.info("maxFileSize=" + maxFileSizeStr);
  long maxFileSize = DEFAULT_RAW_FILE_SIZE_LIMIT;
  try {
    maxFileSize = Long.parseLong(maxFileSizeStr);
  } catch (NumberFormatException nfe) {
    throw new ProcessingException("Caught NumberFormatException during conversion of maxFileSize to long", nfe);
  }

  ProcessRecordService processRecordService = new ProcessRecordService(hbaseConf);
  boolean success = true;
  try {
    // Figure out where we last left off (if anywhere at all)
    ProcessRecord lastProcessRecord = null;
    if (!forceAllFiles) {
      lastProcessRecord = processRecordService.getLastSuccessfulProcessRecord(cluster);
    }

    long minModificationTimeMillis = 0;
    if (lastProcessRecord != null) {
      // Start of this time period is the end of the last period.
      minModificationTimeMillis = lastProcessRecord.getMaxModificationTimeMillis();
    }

    // Do a sanity check. The end time of the last scan better not be later
    // than when we started processing.
    if (minModificationTimeMillis > processingStartMillis) {
      throw new RuntimeException(
          "The last processing record has maxModificationMillis later than now: " + lastProcessRecord);
    }

    // Accept only jobFiles, and only those that fall in the desired range of
    // modification time.
    JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter =
        new JobFileModifiedRangePathFilter(hbaseConf, minModificationTimeMillis);

    String timestamp = Constants.TIMESTAMP_FORMAT.format(new Date(minModificationTimeMillis));

    ContentSummary contentSummary = hdfs.getContentSummary(inputPath);
    LOG.info("Listing / filtering (" + contentSummary.getFileCount() + ") files in: " + inputPath
        + " that are modified since " + timestamp);

    // Get the files in the done folder. We need to traverse dirs under done
    // recursively for versions that include MAPREDUCE-323:
    // on/after hadoop 0.20.203.0 and on/after cdh3u5.
    FileStatus[] jobFileStatusses =
        FileLister.getListFilesToProcess(maxFileSize, true, hdfs, inputPath, jobFileModifiedRangePathFilter);

    LOG.info("Sorting " + jobFileStatusses.length + " job files.");
    Arrays.sort(jobFileStatusses, new FileStatusModificationComparator());

    // Process these files in batches.
    int batchCount = BatchUtil.getBatchCount(jobFileStatusses.length, batchSize);
    LOG.info("Batch count: " + batchCount);
    for (int b = 0; b < batchCount; b++) {
      processBatch(jobFileStatusses, b, batchSize, processRecordService, cluster, outputPath);
    }
  } finally {
    processRecordService.close();
  }

  Statistics statistics = FileSystem.getStatistics(inputPath.toUri().getScheme(), hdfs.getClass());
  if (statistics != null) {
    LOG.info("HDFS bytes read: " + statistics.getBytesRead());
    LOG.info("HDFS bytes written: " + statistics.getBytesWritten());
    LOG.info("HDFS read ops: " + statistics.getReadOps());
    LOG.info("HDFS large read ops: " + statistics.getLargeReadOps());
    LOG.info("HDFS write ops: " + statistics.getWriteOps());
  }

  // Return the status
  return success ? 0 : 1;
}
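The toUri().getScheme() call near the end is the interesting part for this page: FileSystem.getStatistics is keyed by URI scheme ("hdfs", "file", ...) plus FileSystem class, not by path. A stripped-down sketch of the same lookup; the path is illustrative, and makeQualified is added here so a scheme-less path still carries a scheme:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileSystem.Statistics;
import org.apache.hadoop.fs.Path;

public class SchemeStatsDemo {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    // makeQualified fills in the default scheme and authority if missing.
    Path inputPath = fs.makeQualified(new Path("/mapred/history/done"));

    // The statistics registry is looked up by scheme and FileSystem class.
    Statistics statistics = FileSystem.getStatistics(inputPath.toUri().getScheme(), fs.getClass());
    if (statistics != null) {
      System.out.println("bytes read: " + statistics.getBytesRead());
    }
  }
}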
From source file:com.twitter.hraven.etl.JobFileProcessor.java
License:Apache License
public int run(String[] args) throws Exception {
  Configuration hbaseConf = HBaseConfiguration.create(getConf());

  // Grab input args and allow for -Dxyz style arguments
  String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

  // Grab the arguments we're looking for.
  CommandLine commandLine = parseArgs(otherArgs);

  // Grab the cluster argument
  String cluster = commandLine.getOptionValue("c");
  LOG.info("cluster=" + cluster);

  // Number of parallel threads to use
  int threadCount = 1;
  if (commandLine.hasOption("t")) {
    try {
      threadCount = Integer.parseInt(commandLine.getOptionValue("t"));
    } catch (NumberFormatException nfe) {
      throw new IllegalArgumentException(
          "Provided thread-count argument (-t) is not a number: " + commandLine.getOptionValue("t"), nfe);
    }
    if (threadCount < 1) {
      throw new IllegalArgumentException(
          "Cannot run fewer than 1 thread. Provided thread-count argument (-t): " + threadCount);
    }
  }
  LOG.info("threadCount=" + threadCount);

  boolean reprocess = commandLine.hasOption("r");
  LOG.info("reprocess=" + reprocess);

  // Grab the batch-size argument
  int batchSize;
  if (commandLine.hasOption("b")) {
    try {
      batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
    } catch (NumberFormatException nfe) {
      throw new IllegalArgumentException(
          "batch size option -b is not a valid number: " + commandLine.getOptionValue("b"), nfe);
    }
    // Additional check
    if (batchSize < 1) {
      throw new IllegalArgumentException(
          "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
              + commandLine.getOptionValue("b"));
    }
  } else {
    batchSize = DEFAULT_BATCH_SIZE;
  }

  // Grab the costfile argument
  String costFilePath = commandLine.getOptionValue("zf");
  LOG.info("cost properties file on hdfs=" + costFilePath);
  if (costFilePath == null) {
    costFilePath = Constants.COST_PROPERTIES_HDFS_DIR;
  }
  Path hdfsPath = new Path(costFilePath + Constants.COST_PROPERTIES_FILENAME);
  // Add to distributed cache.
  DistributedCache.addCacheFile(hdfsPath.toUri(), hbaseConf);

  // Grab the machine type argument.
  String machineType = commandLine.getOptionValue("m");
  // Set it as part of conf so that the hRaven job can access it in the mapper.
  hbaseConf.set(Constants.HRAVEN_MACHINE_TYPE, machineType);

  // Check if the re-aggregate option is forced on. If yes, we need to
  // aggregate for this job in spite of the job having aggregation-done
  // status in the raw table.
  boolean reAggregateFlagValue = false;
  if (commandLine.hasOption("ra")) {
    String reaggregateFlag = commandLine.getOptionValue("ra");
    // Set it as part of conf so that the hRaven jobProcessor can access it in the mapper.
    if (StringUtils.isNotBlank(reaggregateFlag)) {
      LOG.info(" reaggregateFlag is: " + reaggregateFlag);
      if (StringUtils.equalsIgnoreCase(reaggregateFlag, Boolean.TRUE.toString())) {
        reAggregateFlagValue = true;
      }
    }
  }
  LOG.info(AggregationConstants.RE_AGGREGATION_FLAG_NAME + "=" + reAggregateFlagValue);
  hbaseConf.setBoolean(AggregationConstants.RE_AGGREGATION_FLAG_NAME, reAggregateFlagValue);

  // Set aggregation to off by default.
  boolean aggFlagValue = false;
  if (commandLine.hasOption("a")) {
    String aggregateFlag = commandLine.getOptionValue("a");
    // Set it as part of conf so that the hRaven jobProcessor can access it in the mapper.
    if (StringUtils.isNotBlank(aggregateFlag)) {
      LOG.info(" aggregateFlag is: " + aggregateFlag);
      if (StringUtils.equalsIgnoreCase(aggregateFlag, Boolean.TRUE.toString())) {
        aggFlagValue = true;
      }
    }
  }
  if (reprocess) {
    // Turn off aggregation if reprocessing is true. We don't want to
    // inadvertently aggregate again while re-processing;
    // re-aggregation needs to be a conscious setting.
    aggFlagValue = false;
  }
  LOG.info(AggregationConstants.AGGREGATION_FLAG_NAME + "=" + aggFlagValue);
  hbaseConf.setBoolean(AggregationConstants.AGGREGATION_FLAG_NAME, aggFlagValue);

  String processFileSubstring = null;
  if (commandLine.hasOption("p")) {
    processFileSubstring = commandLine.getOptionValue("p");
  }
  LOG.info("processFileSubstring=" + processFileSubstring);

  // hbase.client.keyvalue.maxsize somehow defaults to 10 MB and we have
  // history files exceeding that. Disable the limit.
  hbaseConf.setInt("hbase.client.keyvalue.maxsize", 0);

  // Shove this into the jobConf so that we can get it out on the task side.
  hbaseConf.setStrings(Constants.CLUSTER_JOB_CONF_KEY, cluster);

  boolean success = false;
  if (reprocess) {
    success = reProcessRecords(hbaseConf, cluster, batchSize, threadCount);
  } else {
    success = processRecords(hbaseConf, cluster, batchSize, threadCount, processFileSubstring);
  }

  // Return the status
  return success ? 0 : 1;
}
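The toUri() here matters because DistributedCache.addCacheFile takes a java.net.URI rather than a Path. A minimal sketch of the same idiom with a made-up file location (newer Hadoop releases deprecate DistributedCache in favor of Job.addCacheFile, which also takes a URI):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;

public class CacheFileDemo {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    Path costFile = new Path("/hraven/cost.properties"); // illustrative path
    // The Path is converted to a URI because that is what the cache API expects.
    DistributedCache.addCacheFile(costFile.toUri(), conf);
  }
}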
From source file:com.twitter.hraven.etl.TestFileLister.java
License:Apache License
@Test
public void testPruneFileListBySize() throws IOException {
  long maxFileSize = 20L;
  FileStatus[] origList = new FileStatus[2];
  FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
  Path inputPath = new Path("/inputdir_filesize");
  boolean os = hdfs.mkdirs(inputPath);
  assertTrue(os);
  assertTrue(hdfs.exists(inputPath));

  final String JOB_HISTORY_FILE_NAME =
      "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
  File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
  Path srcPath = new Path(jobHistoryfile.toURI());
  hdfs.copyFromLocalFile(srcPath, inputPath);
  Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
  assertTrue(hdfs.exists(expPath));
  origList[0] = hdfs.getFileStatus(expPath);

  final String JOB_CONF_FILE_NAME = "src/test/resources/job_1329348432655_0001_conf.xml";
  File jobConfFile = new File(JOB_CONF_FILE_NAME);
  srcPath = new Path(jobConfFile.toURI());
  hdfs.copyFromLocalFile(srcPath, inputPath);
  expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
  assertTrue(hdfs.exists(expPath));
  origList[1] = hdfs.getFileStatus(expPath);

  FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
  assertNotNull(prunedList);
  assertTrue(prunedList.length == 0);

  Path emptyFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101-1329111113227-user2-Sleep.jhist");
  os = hdfs.createNewFile(emptyFile);
  assertTrue(os);
  assertTrue(hdfs.exists(emptyFile));
  origList[0] = hdfs.getFileStatus(emptyFile);

  Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101_conf.xml");
  os = hdfs.createNewFile(emptyConfFile);
  assertTrue(os);
  assertTrue(hdfs.exists(emptyConfFile));
  origList[1] = hdfs.getFileStatus(emptyConfFile);

  prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
  assertNotNull(prunedList);
  assertTrue(prunedList.length == 2);
}
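This test (and the two that follow) builds child paths by string-concatenating the parent's URI with a file name. A small sketch of why that works, next to the equivalent parent/child Path constructor; the names are copied from the test above:

import org.apache.hadoop.fs.Path;

public class ChildPathDemo {
  public static void main(String[] args) {
    Path inputPath = new Path("/inputdir_filesize");
    String name = "job_1329341111111_0101_conf.xml";

    // The test's idiom: concatenate the URI's string form with the child name.
    Path viaUri = new Path(inputPath.toUri() + "/" + name);

    // The more common equivalent: the parent/child constructor.
    Path viaChild = new Path(inputPath, name);

    System.out.println(viaUri.equals(viaChild)); // true for plain paths like this
  }
}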
From source file:com.twitter.hraven.etl.TestFileLister.java
License:Apache License
/**
 * Removes a conf file which has already been put in the pruned list.
 *
 * @throws IOException
 */
@Test
public void testPruneFileListRemovingConfFromPruneList() throws IOException {
  long maxFileSize = 20L;
  FileStatus[] origList = new FileStatus[2];
  FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
  Path inputPath = new Path("/inputdir_filesize_pruneList");
  boolean os = hdfs.mkdirs(inputPath);
  assertTrue(os);
  assertTrue(hdfs.exists(inputPath));

  Path relocationPath = new Path("/relocation_filesize_pruneList");
  os = hdfs.mkdirs(relocationPath);
  assertTrue(os);
  assertTrue(hdfs.exists(relocationPath));

  Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329348432655_0001_conf.xml");
  os = hdfs.createNewFile(emptyConfFile);
  assertTrue(os);
  assertTrue(hdfs.exists(emptyConfFile));
  origList[0] = hdfs.getFileStatus(emptyConfFile);

  final String JOB_HISTORY_FILE_NAME =
      "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
  File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
  Path srcPath = new Path(jobHistoryfile.toURI());
  hdfs.copyFromLocalFile(srcPath, inputPath);
  Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
  assertTrue(hdfs.exists(expPath));
  origList[1] = hdfs.getFileStatus(expPath);

  FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
  assertNotNull(prunedList);
  assertTrue(prunedList.length == 0);
}
From source file:com.twitter.hraven.etl.TestFileLister.java
License:Apache License
/**
 * Tests the case when several files are spread out in the dir and need to be removed.
 *
 * @throws IOException
 */
@Test
public void testPruneFileListMultipleFilesAlreadyMovedCases() throws IOException {
  long maxFileSize = 20L;
  FileStatus[] origList = new FileStatus[12];
  FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
  Path inputPath = new Path("/inputdir_filesize_multiple");
  boolean os = hdfs.mkdirs(inputPath);
  assertTrue(os);
  assertTrue(hdfs.exists(inputPath));

  Path relocationPath = new Path("/relocation_filesize_multiple");
  os = hdfs.mkdirs(relocationPath);
  assertTrue(os);
  assertTrue(hdfs.exists(relocationPath));

  Path emptyFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101-1329111113227-user2-Sleep.jhist");
  os = hdfs.createNewFile(emptyFile);
  assertTrue(os);
  assertTrue(hdfs.exists(emptyFile));
  origList[0] = hdfs.getFileStatus(emptyFile);

  Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101_conf.xml");
  os = hdfs.createNewFile(emptyConfFile);
  assertTrue(os);
  assertTrue(hdfs.exists(emptyConfFile));
  origList[1] = hdfs.getFileStatus(emptyConfFile);

  final String JOB_HISTORY_FILE_NAME =
      "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
  File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
  Path srcPath = new Path(jobHistoryfile.toURI());
  hdfs.copyFromLocalFile(srcPath, inputPath);
  Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
  assertTrue(hdfs.exists(expPath));
  origList[2] = hdfs.getFileStatus(expPath);

  final String JOB_CONF_FILE_NAME = "src/test/resources/job_1329348432655_0001_conf.xml";
  File jobConfFile = new File(JOB_CONF_FILE_NAME);
  srcPath = new Path(jobConfFile.toURI());
  hdfs.copyFromLocalFile(srcPath, inputPath);
  expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
  assertTrue(hdfs.exists(expPath));
  origList[3] = hdfs.getFileStatus(expPath);

  Path inputPath2 = new Path(inputPath.toUri() + "/"
      + "job_1311222222255_0221-1311111143227-user10101-WordCount-1-SUCCEEDED-default.jhist");
  hdfs.copyFromLocalFile(srcPath, inputPath2);
  assertTrue(hdfs.exists(inputPath2));
  origList[4] = hdfs.getFileStatus(inputPath2);

  Path inputPath3 = new Path(inputPath.toUri() + "/"
      + "job_1399999999155_0991-1311111143227-user3321-TeraGen-1-SUCCEEDED-default.jhist");
  hdfs.copyFromLocalFile(srcPath, inputPath3);
  assertTrue(hdfs.exists(inputPath3));
  origList[5] = hdfs.getFileStatus(inputPath3);

  Path inputPath4 = new Path(inputPath.toUri() + "/"
      + "job_1399977777177_0771-1311111143227-user3321-TeraSort-1-SUCCEEDED-default.jhist");
  hdfs.copyFromLocalFile(srcPath, inputPath4);
  assertTrue(hdfs.exists(inputPath4));
  origList[6] = hdfs.getFileStatus(inputPath4);

  Path emptyFile2 =
      new Path(inputPath.toUri() + "/" + "job_1329343333333_5551-1329111113227-user2-SomethingElse.jhist");
  os = hdfs.createNewFile(emptyFile2);
  assertTrue(os);
  assertTrue(hdfs.exists(emptyFile2));
  origList[7] = hdfs.getFileStatus(emptyFile2);

  Path emptyConfFile2 = new Path(inputPath.toUri() + "/" + "job_1329343333333_5551_conf.xml");
  os = hdfs.createNewFile(emptyConfFile2);
  assertTrue(os);
  assertTrue(hdfs.exists(emptyConfFile2));
  origList[8] = hdfs.getFileStatus(emptyConfFile2);

  // This is an empty file which tests the toBeRemovedFileList
  // at the end of the function pruneFileListBySize.
  Path emptyConfFile3 = new Path(inputPath.toUri() + "/" + "job_1399999999155_0991_conf.xml");
  os = hdfs.createNewFile(emptyConfFile3);
  assertTrue(os);
  assertTrue(hdfs.exists(emptyConfFile3));
  origList[9] = hdfs.getFileStatus(emptyConfFile3);

  Path inputConfPath2 = new Path(inputPath.toUri() + "/" + "job_1311222222255_0221_conf.xml");
  srcPath = new Path(jobConfFile.toURI());
  hdfs.copyFromLocalFile(srcPath, inputConfPath2);
  assertTrue(hdfs.exists(inputConfPath2));
  origList[10] = hdfs.getFileStatus(inputConfPath2);

  // This is an empty file which tests the toBeRemovedFileList
  // at the end of the function pruneFileListBySize.
  Path emptyConfFile4 = new Path(inputPath.toUri() + "/" + "job_1399977777177_0771_conf.xml");
  os = hdfs.createNewFile(emptyConfFile4);
  assertTrue(os);
  assertTrue(hdfs.exists(emptyConfFile4));
  origList[11] = hdfs.getFileStatus(emptyConfFile4);

  FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
  assertNotNull(prunedList);
  assertTrue(prunedList.length == 4);
}
From source file:com.twitter.pig.backend.hadoop.executionengine.tez.TezJobControlCompiler.java
License:Apache License
/**
 * Moves all the results of a collection of MR jobs to the final
 * output directory. Some of the results may have been put into a
 * temp location to work around restrictions with multiple output
 * from a single map reduce job.
 *
 * This method should always be called after the job execution
 * completes.
 */
public void moveResults(List<Job> completedJobs) throws IOException {
  for (Job job : completedJobs) {
    Pair<List<POStore>, Path> pair = jobStoreMap.get(job);
    if (pair != null && pair.second != null) {
      Path tmp = pair.second;
      Path abs = new Path(tmp, "abs");
      Path rel = new Path(tmp, "rel");
      FileSystem fs = tmp.getFileSystem(conf);
      if (fs.exists(abs)) {
        moveResults(abs, abs.toUri().getPath(), fs);
      }
      if (fs.exists(rel)) {
        moveResults(rel, rel.toUri().getPath() + "/", fs);
      }
    }
  }
}
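Both calls rely on toUri().getPath() stripping the scheme and authority so that only the slash-separated path component remains; presumably the recursive moveResults overload (not shown here) treats that string as the prefix to rewrite. A minimal sketch of just the stripping step, with a made-up path:

import org.apache.hadoop.fs.Path;

public class StripSchemeDemo {
  public static void main(String[] args) {
    Path abs = new Path("hdfs://nn:8020/tmp/job_tmp/abs");
    // toUri().getPath() drops "hdfs://nn:8020" and keeps only the path part.
    System.out.println(abs.toUri().getPath()); // /tmp/job_tmp/abs
  }
}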
From source file:com.twitter.pig.backend.hadoop.executionengine.tez.TezJobControlCompiler.java
License:Apache License
private Path removePart(Path src, String part) {
  URI uri = src.toUri();
  String pathStr = uri.getPath().replace(part, "");
  return new Path(pathStr);
}
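A self-contained sketch of how this helper behaves, with the method reproduced so the demo compiles on its own and a hypothetical path and part; note that because only uri.getPath() feeds the new Path, any scheme and authority on the input are silently dropped:

import java.net.URI;
import org.apache.hadoop.fs.Path;

public class RemovePartDemo {
  // Same logic as removePart above, reproduced for a runnable demo.
  private static Path removePart(Path src, String part) {
    URI uri = src.toUri();
    String pathStr = uri.getPath().replace(part, "");
    return new Path(pathStr);
  }

  public static void main(String[] args) {
    Path src = new Path("hdfs://nn:8020/tmp/job_tmp/abs/part-00000"); // hypothetical
    System.out.println(removePart(src, "abs/")); // /tmp/job_tmp/part-00000
  }
}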
From source file:com.uber.hoodie.common.io.storage.HoodieWrapperFileSystem.java
License:Apache License
private static Path convertPathWithScheme(Path oldPath, String newScheme) {
  URI oldURI = oldPath.toUri();
  URI newURI;
  try {
    newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(), oldURI.getPath(),
        oldURI.getQuery(), oldURI.getFragment());
    return new Path(newURI);
  } catch (URISyntaxException e) {
    // TODO - Better Exception handling
    throw new RuntimeException(e);
  }
}
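A usage sketch, assuming the intent is to swap only the scheme while keeping user info, host, port, path, query, and fragment intact; the wrapper scheme name below is made up for illustration:

import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.fs.Path;

public class ConvertSchemeDemo {
  private static Path convertPathWithScheme(Path oldPath, String newScheme) {
    URI o = oldPath.toUri();
    try {
      // Rebuild the URI piecewise; every component except the scheme survives.
      return new Path(new URI(newScheme, o.getUserInfo(), o.getHost(), o.getPort(), o.getPath(), o.getQuery(),
          o.getFragment()));
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
  }

  public static void main(String[] args) {
    Path p = new Path("hdfs://nn:8020/data/table/file.parquet");
    System.out.println(convertPathWithScheme(p, "wrapped-hdfs"));
    // -> wrapped-hdfs://nn:8020/data/table/file.parquet
  }
}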
From source file:com.uber.hoodie.common.table.timeline.dto.FilePathDTO.java
License:Apache License
public static FilePathDTO fromPath(Path path) {
  if (null == path) {
    return null;
  }
  FilePathDTO dto = new FilePathDTO();
  dto.uri = path.toUri().toString();
  return dto;
}
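Storing path.toUri().toString() yields a string that new Path(String) can parse back, which is presumably why the DTO serializes the path this way. A small round-trip sketch with an illustrative path:

import org.apache.hadoop.fs.Path;

public class PathRoundTripDemo {
  public static void main(String[] args) {
    Path original = new Path("hdfs://nn:8020/table/.hoodie/timeline"); // illustrative
    String serialized = original.toUri().toString(); // as the DTO stores it
    Path restored = new Path(serialized);            // reconstruct later
    System.out.println(original.equals(restored));   // true
  }
}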