List of usage examples for org.apache.hadoop.fs FileSystem getContentSummary
public ContentSummary getContentSummary(Path f) throws IOException
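Before the project examples below, here is a minimal, self-contained sketch of a typical getContentSummary call. It is not taken from any of the listed source files; the path /user/example/data and the use of the default Configuration are illustrative assumptions.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class ContentSummaryExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical path; replace with an existing file or directory on your cluster
        Path target = new Path("/user/example/data");

        // getContentSummary walks the whole subtree, so it can be expensive on large directories
        ContentSummary summary = fs.getContentSummary(target);

        System.out.println("Total length (bytes): " + summary.getLength());
        System.out.println("File count:           " + summary.getFileCount());
        System.out.println("Directory count:      " + summary.getDirectoryCount());

        fs.close();
    }
}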
From source file:BwaInterpreter.java
License:Open Source License
private void setTotalInputLength() {
    try {
        FileSystem fs = FileSystem.get(this.conf);

        // To get the input files sizes
        ContentSummary cSummaryFile1 = fs.getContentSummary(new Path(options.getInputPath()));
        long lengthFile1 = cSummaryFile1.getLength();
        long lengthFile2 = 0;

        if (!options.getInputPath2().isEmpty()) {
            ContentSummary cSummaryFile2 = fs.getContentSummary(new Path(options.getInputPath2()));
            lengthFile2 = cSummaryFile2.getLength();
        }

        // Total size. Depends on paired or single reads
        this.totalInputLength = lengthFile1 + lengthFile2;
        fs.close();
    } catch (IOException e) {
        LOG.error(e.toString());
        e.printStackTrace();
    }
}
From source file:BwaInterpreter.java
License:Open Source License
/**
 * Used to perform the sort operation in HDFS
 * @brief This function provides a method to perform the sort phase in HDFS
 * @author José M. Abuín
 * @param fileName1 The first file that contains input FASTQ reads. Stored in HDFS
 * @param fileName2 The second file that contains input FASTQ reads. Stored in HDFS
 * @return A JavaRDD that contains the paired reads sorted
 */
public JavaRDD<Tuple2<String, String>> SortInHDFS2(String fileName1, String fileName2) {

    Configuration conf = this.conf;

    LOG.info("JMAbuin:: Starting writing reads to HDFS");

    try {
        FileSystem fs = FileSystem.get(conf);

        Path outputFilePath = new Path(this.inputTmpFileName);

        // To write the paired reads
        FSDataOutputStream outputFinalStream = fs.create(outputFilePath, true);

        // To read paired reads from both files
        BufferedReader brFastqFile1 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName1))));
        BufferedReader brFastqFile2 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName2))));

        String lineFastq1;
        String lineFastq2;

        lineFastq1 = brFastqFile1.readLine();
        lineFastq2 = brFastqFile2.readLine();

        // Loop to read the two files. Both of them must have the same number of lines
        while (lineFastq1 != null) {
            // The lines are written interspersed
            outputFinalStream.write((lineFastq1 + "\n" + lineFastq2 + "\n").getBytes());

            // The next lines are read
            lineFastq1 = brFastqFile1.readLine();
            lineFastq2 = brFastqFile2.readLine();
        }

        // Close the input and output files
        brFastqFile1.close();
        brFastqFile2.close();
        outputFinalStream.close();

        // Now it is time to read the previously created file and create the RDD
        ContentSummary cSummary = fs.getContentSummary(outputFilePath);

        long length = cSummary.getLength();

        this.totalInputLength = length;

        fs.close();

        // In case the user does want partitioning
        if (this.options.getPartitionNumber() != 0) {

            // These options are set to indicate the split size and get the correct number of partitions
            this.conf.set("mapreduce.input.fileinputformat.split.maxsize",
                    String.valueOf((length) / this.options.getPartitionNumber()));
            this.conf.set("mapreduce.input.fileinputformat.split.minsize",
                    String.valueOf((length) / this.options.getPartitionNumber()));

            LOG.info("JMAbuin partitioning from HDFS:: "
                    + String.valueOf((length) / this.options.getPartitionNumber()));

            // Using the FastqInputFormatDouble class we get values from the HDFS file.
            // After that, these values are stored in an RDD
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class, Long.class,
                    String.class, this.conf).mapPartitions(new BigFastq2RDDPartitionsDouble(), true);

        } else {
            // Using the FastqInputFormatDouble class we get values from the HDFS file.
            // After that, these values are stored in an RDD
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class, Long.class,
                    String.class, this.conf).map(new BigFastq2RDDDouble());
        }

    } catch (IOException e) {
        e.printStackTrace();
        LOG.error(e.toString());
        return null;
    }
}
From source file:BigBWA.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    for (String argumento : args) {
        LOG.info("Arg: " + argumento);
    }

    String inputPath = "";
    String outputPath = "";
    boolean useReducer = false;

    BwaOptions options = new BwaOptions(args);

    // We set the timeout and establish the bwa library to call BWA methods
    conf.set("mapreduce.task.timeout", "0");
    conf.set("mapreduce.map.env", "LD_LIBRARY_PATH=./bwa.zip/");

    //==================Algorithm selection==================
    // One of the algorithms is going to be in use, because the default is always specified.
    if (options.isMemAlgorithm()) {
        // Case of the mem algorithm
        conf.set("mem", "true");
        conf.set("aln", "false");
        conf.set("bwasw", "false");
    } else if (options.isAlnAlgorithm()) {
        // Case of aln algorithm
        conf.set("mem", "false");
        conf.set("aln", "true");
        conf.set("bwasw", "false");
    } else if (options.isBwaswAlgorithm()) {
        // Case of bwasw algorithm
        conf.set("mem", "false");
        conf.set("aln", "false");
        conf.set("bwasw", "true");
    }

    //==================Index selection==================
    if (options.getIndexPath() != "") {
        conf.set("indexRoute", options.getIndexPath());
    } else {
        System.err.println("No index has been found. Aborting.");
        System.exit(1);
    }

    //==================Type of reads selection==================
    // There is always going to be a type of reads, because the default is paired
    if (options.isPairedReads()) {
        conf.set("paired", "true");
        conf.set("single", "false");
    } else if (options.isSingleReads()) {
        conf.set("paired", "false");
        conf.set("single", "true");
    }

    //==================Use of reducer==================
    if (options.isUseReducer()) {
        useReducer = true;
        conf.set("useReducer", "true");
    } else {
        conf.set("useReducer", "false");
    }

    //==================Number of threads per map==================
    if (options.getNumThreads() != "0") {
        conf.set("bwathreads", options.getNumThreads());
    }

    //==================RG Header===================
    if (options.getReadgroupHeader() != "") {
        conf.set("rgheader", options.getReadgroupHeader());
    }

    //==================Input and output paths==================
    inputPath = options.getInputPath();
    outputPath = options.getOutputPath();

    conf.set("outputGenomics", outputPath);

    //==================Partition number==================
    if (options.getPartitionNumber() != 0) {
        try {
            FileSystem fs = FileSystem.get(conf);

            Path inputFilePath = new Path(inputPath);

            ContentSummary cSummary = fs.getContentSummary(inputFilePath);

            long length = cSummary.getLength();

            fs.close();

            conf.set("mapreduce.input.fileinputformat.split.maxsize",
                    String.valueOf((length) / options.getPartitionNumber()));
            conf.set("mapreduce.input.fileinputformat.split.minsize",
                    String.valueOf((length) / options.getPartitionNumber()));
        } catch (IOException e) {
            e.printStackTrace();
            LOG.error(e.toString());
            System.exit(1);
        }
    }

    // Job job = new Job(conf, "BigBWA_" + outputPath);
    Job job = Job.getInstance(conf, "BigBWA_" + outputPath);

    job.setJarByClass(BigBWA.class);
    job.setMapperClass(BigBWAMap.class);
    // job.setCombinerClass(BigBWACombiner.class);

    if (useReducer) {
        job.setReducerClass(BigBWAReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setNumReduceTasks(1);
    } else {
        job.setNumReduceTasks(0);
    }

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.blackberry.logdriver.util.IndexLogs.java
License:Apache License
private static void updateComponent(Map<String, Map<String, Map<String, Map<String, Component>>>> data,
        List<String> unmergedCSVStrings, FileSystem fs, FileStatus matchedFolder, Path path)
        throws IOException, ParseException {
    // Parse path by splitting it across slashes. To determine service (which might contain slashes) grab
    // everything after the DC name, but before the matched date string.
    String[] pathPieces = matchedFolder.getPath().toString().split("/");
    String[] servicePieces = path.toString().split(pathPieces[4] + "/");
    servicePieces = servicePieces[1].split("/" + pathPieces[pathPieces.length - 5]);
    String DC = pathPieces[4];
    String service = servicePieces[0];
    String component = pathPieces[pathPieces.length - 2];
    String type = pathPieces[pathPieces.length - 5];
    String status = pathPieces[pathPieces.length - 1];
    Date date = inputFormat.parse(pathPieces[pathPieces.length - 4]);

    // If the _READY file doesn't exist, add it to the list
    Path READYPath = new Path(path.toString() + "/_READY");
    // System.out.println("Checking for " + READYPath.toString());
    if (!fs.exists(READYPath)) {
        unmergedCSVStrings.add(DC + "," + service + "," + type + "," + component + ","
                + pathPieces[pathPieces.length - 4] + "," + pathPieces[pathPieces.length - 3] + "\n");
        // System.out.println(unmergedCSVString);
    }

    // Check if there is a matching component, create one if not.
    if (!componentExists(data, DC, service, type, component)) {
        data.get(DC).get(service).get(type).put(component, new Component(DC, service, type, component, date));
    }
    Component thisComponent = data.get(DC).get(service).get(type).get(component);

    // Update the start or end date if the current date is before or after, respectively.
    if (date.before(thisComponent.startDate)) {
        thisComponent.startDate = date;
    } else if (date.after(thisComponent.endDate)) {
        thisComponent.endDate = date;
    }

    // Is the current folder an archive? If so and date is later than the current archiveDate, update it.
    if (status.matches("archive") && date.after(thisComponent.archiveDate)) {
        thisComponent.archiveDate = date;
    }

    // Add size data
    if (status.matches("data")) {
        thisComponent.addDataSize(fs.getContentSummary(matchedFolder.getPath()).getLength());
    } else if (status.matches("incoming")) {
        thisComponent.addIncomingSize(fs.getContentSummary(matchedFolder.getPath()).getLength());
    } else if (status.matches("archive")) {
        thisComponent.addArchiveSize(fs.getContentSummary(matchedFolder.getPath()).getLength());
    }
}
From source file:com.blackberry.logdriver.util.LogStats.java
License:Apache License
public static double[] getDataOverTime(FileSystem fs, Component component, Date startDate, Date endDate) {
    if (startDate.after(component.endDate) || endDate.before(component.startDate)) {
        return new double[0];
    }

    // If the date range specified overlaps archived data, notify the user
    if (startDate.before(component.archiveDate)) {
        System.out.println("Warning: Time range specified includes archived data");
    }

    // Set up variable and array. Fill with -1 to indicate if hours are unused at the end.
    long totalHours = (endDate.getTime() - startDate.getTime()) / oneHour;
    int logVolumesIndex = 0;
    double[] logVolumes = new double[(int) totalHours];
    String basePath = "/service/" + component.DC + "/" + component.service + "/" + component.type + "/";

    for (Long currentDate = startDate.getTime(); currentDate < endDate.getTime(); currentDate += oneHour) {
        @SuppressWarnings("deprecation")
        String dateAndHour = inputFormat.format(new Date(currentDate)) + "/"
                + String.format("%02d", new Date(currentDate).getHours()) + "/";
        Path path = new Path(basePath + dateAndHour + component.component);

        if (component.startDate.getTime() - oneDay < currentDate
                && component.endDate.getTime() + oneDay > currentDate) {
            try {
                logVolumes[logVolumesIndex] = fs.getContentSummary(path).getLength();
            } catch (IOException e) {
                logVolumes[logVolumesIndex] = 0;
            }
        } else {
            logVolumes[logVolumesIndex] = 0;
        }
        logVolumesIndex++;
    }

    return logVolumes;
}
From source file:com.blackberry.logtools.LogTools.java
License:Apache License
public long getSize(long foundresults, String tmp, FileSystem fs) throws Exception {
    if (foundresults == 0) {
        logConsole(true, true, error, "No logs found for the given component(s) and time range.");
        System.exit(1);
    }
    return fs.getContentSummary(new Path(tmp + "/rawlines")).getLength();
}
From source file:com.blm.orc.FileDump.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    List<String> files = new ArrayList<String>();
    List<Integer> rowIndexCols = null;
    for (String arg : args) {
        if (arg.startsWith("--")) {
            if (arg.startsWith(ROWINDEX_PREFIX)) {
                String[] colStrs = arg.substring(ROWINDEX_PREFIX.length()).split(",");
                rowIndexCols = new ArrayList<Integer>(colStrs.length);
                for (String colStr : colStrs) {
                    rowIndexCols.add(Integer.parseInt(colStr));
                }
            } else {
                System.err.println("Unknown argument " + arg);
            }
        } else {
            files.add(arg);
        }
    }

    for (String filename : files) {
        System.out.println("Structure for " + filename);
        Path path = new Path(filename);
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        System.out.println(
                "File Version: " + reader.getFileVersion().getName() + " with " + reader.getWriterVersion());
        RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
        System.out.println("Rows: " + reader.getNumberOfRows());
        System.out.println("Compression: " + reader.getCompression());
        if (reader.getCompression() != CompressionKind.NONE) {
            System.out.println("Compression size: " + reader.getCompressionSize());
        }
        System.out.println("Type: " + reader.getObjectInspector().getTypeName());
        System.out.println("\nStripe Statistics:");
        Metadata metadata = reader.getMetadata();
        for (int n = 0; n < metadata.getStripeStatistics().size(); n++) {
            System.out.println(" Stripe " + (n + 1) + ":");
            StripeStatistics ss = metadata.getStripeStatistics().get(n);
            for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
                System.out.println(" Column " + i + ": " + ss.getColumnStatistics()[i].toString());
            }
        }
        ColumnStatistics[] stats = reader.getStatistics();
        System.out.println("\nFile Statistics:");
        for (int i = 0; i < stats.length; ++i) {
            System.out.println(" Column " + i + ": " + stats[i].toString());
        }
        System.out.println("\nStripes:");
        int stripeIx = -1;
        for (StripeInformation stripe : reader.getStripes()) {
            ++stripeIx;
            long stripeStart = stripe.getOffset();
            System.out.println(" Stripe: " + stripe.toString());
            OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
            long sectionStart = stripeStart;
            for (OrcProto.Stream section : footer.getStreamsList()) {
                System.out.println(" Stream: column " + section.getColumn() + " section " + section.getKind()
                        + " start: " + sectionStart + " length " + section.getLength());
                sectionStart += section.getLength();
            }
            for (int i = 0; i < footer.getColumnsCount(); ++i) {
                OrcProto.ColumnEncoding encoding = footer.getColumns(i);
                StringBuilder buf = new StringBuilder();
                buf.append(" Encoding column ");
                buf.append(i);
                buf.append(": ");
                buf.append(encoding.getKind());
                if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY
                        || encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
                    buf.append("[");
                    buf.append(encoding.getDictionarySize());
                    buf.append("]");
                }
                System.out.println(buf);
            }
            if (rowIndexCols != null) {
                RowIndex[] indices = rows.readRowIndex(stripeIx);
                for (int col : rowIndexCols) {
                    StringBuilder buf = new StringBuilder();
                    buf.append(" Row group index column ").append(col).append(":");
                    RowIndex index = null;
                    if ((col >= indices.length) || ((index = indices[col]) == null)) {
                        buf.append(" not found\n");
                        continue;
                    }
                    for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
                        buf.append("\n Entry ").append(entryIx).append(":");
                        RowIndexEntry entry = index.getEntry(entryIx);
                        if (entry == null) {
                            buf.append("unknown\n");
                            continue;
                        }
                        OrcProto.ColumnStatistics colStats = entry.getStatistics();
                        if (colStats == null) {
                            buf.append("no stats at ");
                        } else {
                            ColumnStatistics cs = ColumnStatisticsImpl.deserialize(colStats);
                            Object min = RecordReaderImpl.getMin(cs), max = RecordReaderImpl.getMax(cs);
                            buf.append(" count: ").append(cs.getNumberOfValues());
                            buf.append(" min: ").append(min);
                            buf.append(" max: ").append(max);
                        }
                        buf.append(" positions: ");
                        for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
                            if (posIx != 0) {
                                buf.append(",");
                            }
                            buf.append(entry.getPositions(posIx));
                        }
                    }
                    System.out.println(buf);
                }
            }
        }

        FileSystem fs = path.getFileSystem(conf);
        long fileLen = fs.getContentSummary(path).getLength();
        long paddedBytes = getTotalPaddingSize(reader);
        // empty ORC file is ~45 bytes. Assumption here is file length always >0
        double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
        DecimalFormat format = new DecimalFormat("##.##");
        System.out.println("\nFile length: " + fileLen + " bytes");
        System.out.println("Padding length: " + paddedBytes + " bytes");
        System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
        rows.close();
    }
}
From source file:com.cloudera.impala.catalog.HBaseTable.java
License:Apache License
/**
 * Returns the Hdfs size of the given region in bytes. NULL can be
 * passed as a parameter to retrieve the size of the complete table.
 */
public long getHdfsSize(HRegionInfo info) throws IOException {
    Path tableDir = HTableDescriptor.getTableDir(FSUtils.getRootDir(hbaseConf_),
            Bytes.toBytes(hbaseTableName_));
    FileSystem fs = tableDir.getFileSystem(hbaseConf_);
    if (info != null) {
        Path regionDir = tableDir.suffix("/" + info.getEncodedName());
        return fs.getContentSummary(regionDir).getLength();
    } else {
        return fs.getContentSummary(tableDir).getLength();
    }
}
From source file:com.cloudera.sqoop.TestTargetDir.java
License:Apache License
/** test target-dir contains imported files. */
public void testTargetDir() throws IOException {
    try {
        String targetDir = getWarehouseDir() + "/tempTargetDir";

        ArrayList args = getOutputArgv(true);
        args.add("--target-dir");
        args.add(targetDir);

        // delete target-dir if exists and recreate it
        FileSystem fs = FileSystem.get(getConf());
        Path outputPath = new Path(targetDir);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        String[] argv = (String[]) args.toArray(new String[0]);
        runImport(argv);

        ContentSummary summ = fs.getContentSummary(outputPath);

        assertTrue("There's no new imported files in target-dir", summ.getFileCount() > 0);
    } catch (Exception e) {
        LOG.error("Got Exception: " + StringUtils.stringifyException(e));
        fail(e.toString());
    }
}
From source file:com.dasasian.chok.command.ListIndicesCommand.java
License:Apache License
private long calculateIndexDiskUsage(String index) {
    Path indexPath = new Path(index);
    URI indexUri = indexPath.toUri();
    try {
        FileSystem fileSystem = FileSystem.get(indexUri, new Configuration());
        if (!fileSystem.exists(indexPath)) {
            return -1;
        }
        return fileSystem.getContentSummary(indexPath).getLength();
    } catch (Exception e) {
        return -1;
    }
}