List of usage examples for org.apache.hadoop.fs.ContentSummary#getLength()
public long getLength()
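Before the longer examples below, a minimal sketch of the typical call pattern: obtain a ContentSummary for a path (a file or a directory) and read its total length in bytes. The path and class names here are placeholders for illustration, not taken from any of the projects listed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ContentSummaryLengthExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Works for a single file or a directory; for a directory the
        // length is the sum of the lengths of all files beneath it.
        Path path = new Path(args[0]); // e.g. /user/hadoop/input
        ContentSummary summary = fs.getContentSummary(path);

        System.out.println("total length (bytes): " + summary.getLength());
        System.out.println("files: " + summary.getFileCount()
                + ", directories: " + summary.getDirectoryCount());

        fs.close();
    }
}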
From source file:BwaInterpreter.java
License:Open Source License
private void setTotalInputLength() {
    try {
        FileSystem fs = FileSystem.get(this.conf);

        // To get the input files sizes
        ContentSummary cSummaryFile1 = fs.getContentSummary(new Path(options.getInputPath()));
        long lengthFile1 = cSummaryFile1.getLength();
        long lengthFile2 = 0;

        if (!options.getInputPath2().isEmpty()) {
            ContentSummary cSummaryFile2 = fs.getContentSummary(new Path(options.getInputPath2()));
            lengthFile2 = cSummaryFile2.getLength();
        }

        // Total size. Depends on paired or single reads
        this.totalInputLength = lengthFile1 + lengthFile2;
        fs.close();
    } catch (IOException e) {
        LOG.error(e.toString());
        e.printStackTrace();
    }
}
From source file:BwaInterpreter.java
License:Open Source License
/**
 * Used to perform the sort operation in HDFS
 * @brief This function provides a method to perform the sort phase in HDFS
 * @author José M. Abuín
 * @param fileName1 The first file that contains input FASTQ reads. Stored in HDFS
 * @param fileName2 The second file that contains input FASTQ reads. Stored in HDFS
 * @return A JavaRDD that contains the paired reads, sorted
 */
public JavaRDD<Tuple2<String, String>> SortInHDFS2(String fileName1, String fileName2) {

    Configuration conf = this.conf;

    LOG.info("JMAbuin:: Starting writing reads to HDFS");

    try {
        FileSystem fs = FileSystem.get(conf);

        Path outputFilePath = new Path(this.inputTmpFileName);

        // To write the paired reads
        FSDataOutputStream outputFinalStream = fs.create(outputFilePath, true);

        // To read paired reads from both files
        BufferedReader brFastqFile1 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName1))));
        BufferedReader brFastqFile2 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName2))));

        String lineFastq1;
        String lineFastq2;

        lineFastq1 = brFastqFile1.readLine();
        lineFastq2 = brFastqFile2.readLine();

        // Loop to read the two files. Both must have the same number of lines
        while (lineFastq1 != null) {
            // The lines are written interleaved
            outputFinalStream.write((lineFastq1 + "\n" + lineFastq2 + "\n").getBytes());

            // Read the next lines
            lineFastq1 = brFastqFile1.readLine();
            lineFastq2 = brFastqFile2.readLine();
        }

        // Close the input and output files
        brFastqFile1.close();
        brFastqFile2.close();
        outputFinalStream.close();

        // Now it is time to read the previously created file and create the RDD
        ContentSummary cSummary = fs.getContentSummary(outputFilePath);

        long length = cSummary.getLength();

        this.totalInputLength = length;

        fs.close();

        // In case the user wants partitioning
        if (this.options.getPartitionNumber() != 0) {
            // These options are set to indicate the split size and get the correct number of partitions
            this.conf.set("mapreduce.input.fileinputformat.split.maxsize",
                    String.valueOf((length) / this.options.getPartitionNumber()));
            this.conf.set("mapreduce.input.fileinputformat.split.minsize",
                    String.valueOf((length) / this.options.getPartitionNumber()));

            LOG.info("JMAbuin partitioning from HDFS:: "
                    + String.valueOf((length) / this.options.getPartitionNumber()));

            // Using the FastqInputFormatDouble class we get values from the HDFS file.
            // After that, these values are stored in an RDD
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class,
                    Long.class, String.class, this.conf).mapPartitions(new BigFastq2RDDPartitionsDouble(), true);
        } else {
            // Using the FastqInputFormatDouble class we get values from the HDFS file.
            // After that, these values are stored in an RDD
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class,
                    Long.class, String.class, this.conf).map(new BigFastq2RDDDouble());
        }

    } catch (IOException e) {
        e.printStackTrace();
        LOG.error(e.toString());
        return null;
    }
}
From source file:BigBWA.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    for (String argumento : args) {
        LOG.info("Arg: " + argumento);
    }

    String inputPath = "";
    String outputPath = "";
    boolean useReducer = false;

    BwaOptions options = new BwaOptions(args);

    // We set the timeout and establish the bwa library to call BWA methods
    conf.set("mapreduce.task.timeout", "0");
    conf.set("mapreduce.map.env", "LD_LIBRARY_PATH=./bwa.zip/");

    //==================Algorithm election==================
    // One of the algorithms is going to be in use, because the default is always specified.
    if (options.isMemAlgorithm()) {
        // Case of the mem algorithm
        conf.set("mem", "true");
        conf.set("aln", "false");
        conf.set("bwasw", "false");
    } else if (options.isAlnAlgorithm()) {
        // Case of aln algorithm
        conf.set("mem", "false");
        conf.set("aln", "true");
        conf.set("bwasw", "false");
    } else if (options.isBwaswAlgorithm()) {
        // Case of bwasw algorithm
        conf.set("mem", "false");
        conf.set("aln", "false");
        conf.set("bwasw", "true");
    }

    //==================Index election==================
    if (!options.getIndexPath().isEmpty()) {
        conf.set("indexRoute", options.getIndexPath());
    } else {
        System.err.println("No index has been found. Aborting.");
        System.exit(1);
    }

    //==================Type of reads election==================
    // There is always going to be a type of reads, because default is paired
    if (options.isPairedReads()) {
        conf.set("paired", "true");
        conf.set("single", "false");
    } else if (options.isSingleReads()) {
        conf.set("paired", "false");
        conf.set("single", "true");
    }

    //==================Use of reducer==================
    if (options.isUseReducer()) {
        useReducer = true;
        conf.set("useReducer", "true");
    } else {
        conf.set("useReducer", "false");
    }

    //==================Number of threads per map==================
    if (!options.getNumThreads().equals("0")) {
        conf.set("bwathreads", options.getNumThreads());
    }

    //==================RG Header===================
    if (!options.getReadgroupHeader().isEmpty()) {
        conf.set("rgheader", options.getReadgroupHeader());
    }

    //==================Input and output paths==================
    inputPath = options.getInputPath();
    outputPath = options.getOutputPath();

    conf.set("outputGenomics", outputPath);

    //==================Partition number==================
    if (options.getPartitionNumber() != 0) {
        try {
            FileSystem fs = FileSystem.get(conf);

            Path inputFilePath = new Path(inputPath);

            ContentSummary cSummary = fs.getContentSummary(inputFilePath);

            long length = cSummary.getLength();

            fs.close();

            conf.set("mapreduce.input.fileinputformat.split.maxsize",
                    String.valueOf((length) / options.getPartitionNumber()));
            conf.set("mapreduce.input.fileinputformat.split.minsize",
                    String.valueOf((length) / options.getPartitionNumber()));
        } catch (IOException e) {
            e.printStackTrace();
            LOG.error(e.toString());
            System.exit(1);
        }
    }

    //Job job = new Job(conf,"BigBWA_"+outputPath);
    Job job = Job.getInstance(conf, "BigBWA_" + outputPath);

    job.setJarByClass(BigBWA.class);
    job.setMapperClass(BigBWAMap.class);
    //job.setCombinerClass(BigBWACombiner.class);

    if (useReducer) {
        job.setReducerClass(BigBWAReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setNumReduceTasks(1);
    } else {
        job.setNumReduceTasks(0);
    }

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.bigstep.datalake.JsonUtil.java
License:Apache License
/** Convert a ContentSummary to a Json string. */
public static String toJsonString(final ContentSummary contentsummary) {
    if (contentsummary == null) {
        return null;
    }

    final Map<String, Object> m = new TreeMap<String, Object>();
    m.put("length", contentsummary.getLength());
    m.put("fileCount", contentsummary.getFileCount());
    m.put("directoryCount", contentsummary.getDirectoryCount());
    m.put("quota", contentsummary.getQuota());
    m.put("spaceConsumed", contentsummary.getSpaceConsumed());
    m.put("spaceQuota", contentsummary.getSpaceQuota());
    return toJsonString(ContentSummary.class, m);
}
From source file:com.github.seqware.queryengine.tutorial.Poster.java
License:Open Source License
private long convertToGB(ContentSummary contentSummary) {
    // odd, it seems like length reports the equivalent of "hadoop fs -du -s"
    long spaceConsumedinGB = contentSummary.getLength() / 1024 / 1024 / 1024;
    return spaceConsumedinGB;
}
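The comment in the example above touches on a common point of confusion: getLength() reports the logical size of the content (what "hadoop fs -du -s" shows as its first column), while getSpaceConsumed() reports the raw HDFS space used, which includes replication. A small sketch contrasting the two; the path is a placeholder, not from the project above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DuExample {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        ContentSummary summary = fs.getContentSummary(new Path("/user/hadoop/data"));

        // Logical size of the content, ignoring replication.
        long logicalBytes = summary.getLength();
        // Physical space consumed, roughly logical size times the replication factor.
        long rawBytes = summary.getSpaceConsumed();

        System.out.println("logical GB: " + logicalBytes / 1024 / 1024 / 1024);
        System.out.println("raw GB:     " + rawBytes / 1024 / 1024 / 1024);
        fs.close();
    }
}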
From source file:com.jkoolcloud.tnt4j.streams.inputs.HdfsFileLineStream.java
License:Apache License
private static int[] getFilesTotals(FileSystem fs, Path[] activityFiles) {
    int tbc = 0;
    int tlc = 0;
    if (ArrayUtils.isNotEmpty(activityFiles)) {
        for (Path f : activityFiles) {
            try {
                ContentSummary cSummary = fs.getContentSummary(f);
                tbc += cSummary.getLength();
                tlc += Utils.countLines(fs.open(f));
            } catch (IOException exc) {
            }
        }
    }

    return new int[] { tbc, tlc };
}
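One detail worth noting when adapting this pattern: getLength() returns a long, so totals accumulated into an int will overflow once the combined size exceeds roughly 2 GB. A minimal variant that keeps the byte total as a long; the class and method names are placeholders, not from the project above.

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class HdfsSizeUtil {
    // Sums the byte lengths of several HDFS paths without truncating to int.
    static long totalBytes(FileSystem fs, Path[] files) throws IOException {
        long total = 0L; // keep the running total as a long, matching getLength()
        if (files != null) {
            for (Path f : files) {
                total += fs.getContentSummary(f).getLength();
            }
        }
        return total;
    }
}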
From source file:org.apache.falcon.regression.core.util.AssertUtil.java
License:Apache License
/**
 * Checks that the content at two locations has the same size.
 *
 * @param firstPath path to the first location
 * @param secondPath path to the second location
 * @param fs hadoop file system for the locations
 * @throws IOException
 */
public static void checkContentSize(String firstPath, String secondPath, FileSystem fs) throws IOException {
    final ContentSummary firstSummary = fs.getContentSummary(new Path(firstPath));
    final ContentSummary secondSummary = fs.getContentSummary(new Path(secondPath));
    LOGGER.info(firstPath + " : firstSummary = " + firstSummary.toString(false));
    LOGGER.info(secondPath + " : secondSummary = " + secondSummary.toString(false));
    Assert.assertEquals(firstSummary.getLength(), secondSummary.getLength(),
        "Contents at the two locations don't have same size.");
}
From source file:org.apache.falcon.regression.ExternalFSTest.java
License:Apache License
@Test(dataProvider = "getData")
public void replicateToExternalFS(final FileSystem externalFS, final String separator, final boolean withData)
    throws Exception {
    final String endpoint = externalFS.getUri().toString();
    Bundle.submitCluster(bundles[0], externalBundle);
    String startTime = TimeUtil.getTimeWrtSystemTime(0);
    String endTime = TimeUtil.addMinsToTime(startTime, 5);
    LOGGER.info("Time range between : " + startTime + " and " + endTime);
    String datePattern = StringUtils
        .join(new String[] { "${YEAR}", "${MONTH}", "${DAY}", "${HOUR}", "${MINUTE}" }, separator);

    //configure feed
    FeedMerlin feed = new FeedMerlin(bundles[0].getDataSets().get(0));
    String targetDataLocation = endpoint + testWasbTargetDir + datePattern;
    feed.setFilePath(sourcePath + '/' + datePattern);
    //erase all clusters from feed definition
    feed.clearFeedClusters();
    //set local cluster as source
    feed.addFeedCluster(new FeedMerlin.FeedClusterBuilder(Util.readEntityName(bundles[0].getClusters().get(0)))
        .withRetention("days(1000000)", ActionType.DELETE).withValidity(startTime, endTime)
        .withClusterType(ClusterType.SOURCE).build());
    //set externalFS cluster as target
    feed.addFeedCluster(
        new FeedMerlin.FeedClusterBuilder(Util.readEntityName(externalBundle.getClusters().get(0)))
            .withRetention("days(1000000)", ActionType.DELETE).withValidity(startTime, endTime)
            .withClusterType(ClusterType.TARGET).withDataLocation(targetDataLocation).build());

    //submit and schedule feed
    LOGGER.info("Feed : " + Util.prettyPrintXml(feed.toString()));
    AssertUtil.assertSucceeded(prism.getFeedHelper().submitAndSchedule(feed.toString()));
    datePattern = StringUtils.join(new String[] { "yyyy", "MM", "dd", "HH", "mm" }, separator);

    //upload necessary data
    DateTime date = new DateTime(startTime, DateTimeZone.UTC);
    DateTimeFormatter fmt = DateTimeFormat.forPattern(datePattern);
    String timePattern = fmt.print(date);
    HadoopUtil.recreateDir(clusterFS, sourcePath + '/' + timePattern);
    if (withData) {
        HadoopUtil.copyDataToFolder(clusterFS, sourcePath + '/' + timePattern, OSUtil.SINGLE_FILE);
    }

    Path srcPath = new Path(sourcePath + '/' + timePattern);
    Path dstPath = new Path(endpoint + testWasbTargetDir + '/' + timePattern);

    //check if coordinator exists
    TimeUtil.sleepSeconds(10);
    InstanceUtil.waitTillInstancesAreCreated(clusterOC, feed.toString(), 0);
    Assert.assertEquals(OozieUtil.checkIfFeedCoordExist(clusterOC, feed.getName(), "REPLICATION"), 1);

    //replication should start, wait while it ends
    InstanceUtil.waitTillInstanceReachState(clusterOC, Util.readEntityName(feed.toString()), 1,
        CoordinatorAction.Status.SUCCEEDED, EntityType.FEED);

    //check if data has been replicated correctly
    List<Path> cluster1ReplicatedData = HadoopUtil.getAllFilesRecursivelyHDFS(clusterFS, srcPath);
    List<Path> cluster2ReplicatedData = HadoopUtil.getAllFilesRecursivelyHDFS(externalFS, dstPath);
    AssertUtil.checkForListSizes(cluster1ReplicatedData, cluster2ReplicatedData);
    final ContentSummary srcSummary = clusterFS.getContentSummary(srcPath);
    final ContentSummary dstSummary = externalFS.getContentSummary(dstPath);
    Assert.assertEquals(dstSummary.getLength(), srcSummary.getLength());
}
From source file:org.apache.falcon.regression.hcat.HCatProcessTest.java
License:Apache License
@Test(dataProvider = "generateSeparators")
public void twoHCatInputOneHCatOutput(String separator) throws Exception {
    /* upload data and create partition */
    final String datePattern = StringUtils.join(new String[] { "yyyy", "MM", "dd", "HH" }, separator);
    List<String> dataDates = getDatesList(startDate, endDate, datePattern, 60);

    final List<String> dataset = HadoopUtil.flattenAndPutDataInFolder(clusterFS, localHCatData, inputHDFSDir,
        dataDates);
    final List<String> dataset2 = HadoopUtil.flattenAndPutDataInFolder(clusterFS, localHCatData, inputHDFSDir2,
        dataDates);

    ArrayList<HCatFieldSchema> cols = new ArrayList<>();
    cols.add(HCatUtil.getStringSchema(col1Name, col1Name + " comment"));
    cols.add(HCatUtil.getStringSchema(col2Name, col2Name + " comment"));
    ArrayList<HCatFieldSchema> partitionCols = new ArrayList<>();
    partitionCols.add(HCatUtil.getStringSchema(partitionColumn, partitionColumn + " partition"));

    clusterHC.createTable(HCatCreateTableDesc.create(dbName, inputTableName, cols).partCols(partitionCols)
        .ifNotExists(true).isTableExternal(true).location(inputHDFSDir).build());
    clusterHC.createTable(HCatCreateTableDesc.create(dbName, inputTableName2, cols).partCols(partitionCols)
        .ifNotExists(true).isTableExternal(true).location(inputHDFSDir2).build());
    clusterHC.createTable(HCatCreateTableDesc.create(dbName, outputTableName, cols).partCols(partitionCols)
        .ifNotExists(true).isTableExternal(true).location(outputHDFSDir).build());

    addPartitionsToTable(dataDates, dataset, "dt", dbName, inputTableName);
    addPartitionsToTable(dataDates, dataset2, "dt", dbName, inputTableName2);

    final String tableUriPartitionFragment = StringUtils
        .join(new String[] { "#dt=${YEAR}", "${MONTH}", "${DAY}", "${HOUR}" }, separator);
    String inputTableUri = "catalog:" + dbName + ":" + inputTableName + tableUriPartitionFragment;
    String inputTableUri2 = "catalog:" + dbName + ":" + inputTableName2 + tableUriPartitionFragment;
    bundles[0].setInputFeedTableUri(inputTableUri);
    bundles[0].setInputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundles[0].setInputFeedValidity(startDate, endDate);
    final String inputFeed1 = bundles[0].getInputFeedFromBundle();
    final String inputFeed2Name = Util.readEntityName(inputFeed1) + "-second";

    FeedMerlin feedObj = new FeedMerlin(inputFeed1);
    feedObj.setName(inputFeed2Name);
    feedObj.getTable().setUri(inputTableUri2);
    bundles[0].addInputFeedToBundle("inputData2", feedObj);

    String outputTableUri = "catalog:" + dbName + ":" + outputTableName + tableUriPartitionFragment;
    bundles[0].setOutputFeedTableUri(outputTableUri);
    bundles[0].setOutputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundles[0].setOutputFeedValidity(startDate, endDate);
    bundles[0].setProcessValidity(startDate, endDate);
    bundles[0].setProcessPeriodicity(1, Frequency.TimeUnit.hours);
    bundles[0].setProcessInputStartEnd("now(0,0)", "now(0,0)");
    bundles[0].setProcessWorkflow(hiveScriptTwoHCatInputOneHCatOutput, EngineType.HIVE);
    bundles[0].submitFeedsScheduleProcess();

    InstanceUtil.waitTillInstanceReachState(clusterOC, bundles[0].getProcessName(), 1,
        CoordinatorAction.Status.SUCCEEDED, EntityType.PROCESS);

    final ContentSummary inputContentSummary = clusterFS
        .getContentSummary(new Path(inputHDFSDir + "/" + dataDates.get(0)));
    final ContentSummary inputContentSummary2 = clusterFS
        .getContentSummary(new Path(inputHDFSDir2 + "/" + dataDates.get(0)));
    final ContentSummary outputContentSummary = clusterFS
        .getContentSummary(new Path(outputHDFSDir + "/dt=" + dataDates.get(0)));
    LOGGER.info("inputContentSummary = " + inputContentSummary.toString(false));
    LOGGER.info("inputContentSummary2 = " + inputContentSummary2.toString(false));
    LOGGER.info("outputContentSummary = " + outputContentSummary.toString(false));
    Assert.assertEquals(inputContentSummary.getLength() + inputContentSummary2.getLength(),
        outputContentSummary.getLength(), "Unexpected size of the output.");
}
From source file:org.apache.falcon.regression.hcat.HCatProcessTest.java
License:Apache License
@Test(dataProvider = "generateSeparators")
public void twoHCatInputTwoHCatOutput(String separator) throws Exception {
    /* upload data and create partition */
    final String datePattern = StringUtils.join(new String[] { "yyyy", "MM", "dd", "HH" }, separator);
    List<String> dataDates = getDatesList(startDate, endDate, datePattern, 60);

    final List<String> dataset = HadoopUtil.flattenAndPutDataInFolder(clusterFS, localHCatData, inputHDFSDir,
        dataDates);
    final List<String> dataset2 = HadoopUtil.flattenAndPutDataInFolder(clusterFS, localHCatData, inputHDFSDir2,
        dataDates);

    ArrayList<HCatFieldSchema> cols = new ArrayList<>();
    cols.add(HCatUtil.getStringSchema(col1Name, col1Name + " comment"));
    cols.add(HCatUtil.getStringSchema(col2Name, col2Name + " comment"));
    ArrayList<HCatFieldSchema> partitionCols = new ArrayList<>();
    partitionCols.add(HCatUtil.getStringSchema(partitionColumn, partitionColumn + " partition"));

    clusterHC.createTable(HCatCreateTableDesc.create(dbName, inputTableName, cols).partCols(partitionCols)
        .ifNotExists(true).isTableExternal(true).location(inputHDFSDir).build());
    clusterHC.createTable(HCatCreateTableDesc.create(dbName, inputTableName2, cols).partCols(partitionCols)
        .ifNotExists(true).isTableExternal(true).location(inputHDFSDir2).build());
    clusterHC.createTable(HCatCreateTableDesc.create(dbName, outputTableName, cols).partCols(partitionCols)
        .ifNotExists(true).isTableExternal(true).location(outputHDFSDir).build());
    clusterHC.createTable(HCatCreateTableDesc.create(dbName, outputTableName2, cols).partCols(partitionCols)
        .ifNotExists(true).isTableExternal(true).location(outputHDFSDir2).build());

    addPartitionsToTable(dataDates, dataset, "dt", dbName, inputTableName);
    addPartitionsToTable(dataDates, dataset2, "dt", dbName, inputTableName2);

    final String tableUriPartitionFragment = StringUtils
        .join(new String[] { "#dt=${YEAR}", "${MONTH}", "${DAY}", "${HOUR}" }, separator);
    String inputTableUri = "catalog:" + dbName + ":" + inputTableName + tableUriPartitionFragment;
    String inputTableUri2 = "catalog:" + dbName + ":" + inputTableName2 + tableUriPartitionFragment;
    bundles[0].setInputFeedTableUri(inputTableUri);
    bundles[0].setInputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundles[0].setInputFeedValidity(startDate, endDate);
    final String inputFeed1 = bundles[0].getInputFeedFromBundle();
    final String inputFeed2Name = Util.readEntityName(inputFeed1) + "-second";
    FeedMerlin feedObj = new FeedMerlin(inputFeed1);
    feedObj.setName(inputFeed2Name);
    feedObj.getTable().setUri(inputTableUri2);
    bundles[0].addInputFeedToBundle("inputData2", feedObj);

    String outputTableUri = "catalog:" + dbName + ":" + outputTableName + tableUriPartitionFragment;
    String outputTableUri2 = "catalog:" + dbName + ":" + outputTableName2 + tableUriPartitionFragment;
    bundles[0].setOutputFeedTableUri(outputTableUri);
    bundles[0].setOutputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundles[0].setOutputFeedValidity(startDate, endDate);
    final String outputFeed1 = bundles[0].getOutputFeedFromBundle();
    final String outputFeed2Name = Util.readEntityName(outputFeed1) + "-second";
    FeedMerlin feedObj2 = new FeedMerlin(outputFeed1);
    feedObj2.setName(outputFeed2Name);
    feedObj2.getTable().setUri(outputTableUri2);
    bundles[0].addOutputFeedToBundle("outputData2", feedObj2);

    bundles[0].setProcessValidity(startDate, endDate);
    bundles[0].setProcessPeriodicity(1, Frequency.TimeUnit.hours);
    bundles[0].setProcessInputStartEnd("now(0,0)", "now(0,0)");
    bundles[0].setProcessWorkflow(hiveScriptTwoHCatInputTwoHCatOutput, EngineType.HIVE);
    bundles[0].submitFeedsScheduleProcess();

    InstanceUtil.waitTillInstanceReachState(clusterOC, bundles[0].getProcessName(), 1,
        CoordinatorAction.Status.SUCCEEDED, EntityType.PROCESS);

    final ContentSummary inputContentSummary = clusterFS
        .getContentSummary(new Path(inputHDFSDir + "/" + dataDates.get(0)));
    final ContentSummary inputContentSummary2 = clusterFS
        .getContentSummary(new Path(inputHDFSDir2 + "/" + dataDates.get(0)));
    final ContentSummary outputContentSummary = clusterFS
        .getContentSummary(new Path(outputHDFSDir + "/dt=" + dataDates.get(0)));
    final ContentSummary outputContentSummary2 = clusterFS
        .getContentSummary(new Path(outputHDFSDir2 + "/dt=" + dataDates.get(0)));
    LOGGER.info("inputContentSummary = " + inputContentSummary.toString(false));
    LOGGER.info("inputContentSummary2 = " + inputContentSummary2.toString(false));
    LOGGER.info("outputContentSummary = " + outputContentSummary.toString(false));
    LOGGER.info("outputContentSummary2 = " + outputContentSummary2.toString(false));
    Assert.assertEquals(inputContentSummary.getLength() + inputContentSummary2.getLength(),
        outputContentSummary.getLength(), "Unexpected size of the output.");
    Assert.assertEquals(inputContentSummary.getLength() + inputContentSummary2.getLength(),
        outputContentSummary2.getLength(), "Unexpected size of the output.");
}