Example usage for org.apache.hadoop.fs FileSystem getContentSummary

List of usage examples for org.apache.hadoop.fs FileSystem getContentSummary

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem getContentSummary.

Prototype

public ContentSummary getContentSummary(Path f) throws IOException 

Document

Return the ContentSummary of a given Path.
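
Before the per-project examples, here is a minimal, self-contained sketch of the call. The path /user/demo/data and the class name are hypothetical placeholders; note that the returned ContentSummary also carries file, directory, and space-consumed counters alongside the total length.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ContentSummaryExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        try {
            // Hypothetical path; the summary aggregates recursively over a directory tree
            ContentSummary summary = fs.getContentSummary(new Path("/user/demo/data"));
            System.out.println("Total length (bytes): " + summary.getLength());
            System.out.println("File count:           " + summary.getFileCount());
            System.out.println("Directory count:      " + summary.getDirectoryCount());
            System.out.println("Space consumed:       " + summary.getSpaceConsumed());
        } finally {
            fs.close();
        }
    }
}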

Usage

From source file:BwaInterpreter.java

License:Open Source License

private void setTotalInputLength() {
    try {
        FileSystem fs = FileSystem.get(this.conf);

        // Get the input file sizes
        ContentSummary cSummaryFile1 = fs.getContentSummary(new Path(options.getInputPath()));

        long lengthFile1 = cSummaryFile1.getLength();
        long lengthFile2 = 0;

        if (!options.getInputPath2().isEmpty()) {
            ContentSummary cSummaryFile2 = fs.getContentSummary(new Path(options.getInputPath2()));
            lengthFile2 = cSummaryFile2.getLength();
        }

        // Total size. Depends on paired or single reads
        this.totalInputLength = lengthFile1 + lengthFile2;
        fs.close();
    } catch (IOException e) {
        LOG.error(e.toString());
        e.printStackTrace();
    }
}

From source file:BwaInterpreter.java

License:Open Source License

/**
 * Used to perform the sort operation in HDFS
 * @brief This function provides a method to perform the sort phase in HDFS
 * @author José M. Abuín
 * @param fileName1 The first file that contains input FASTQ reads. Stored in HDFS
 * @param fileName2 The second file that contains input FASTQ reads. Stored in HDFS
 * @return A JavaRDD that contains the paired reads sorted
 */
public JavaRDD<Tuple2<String, String>> SortInHDFS2(String fileName1, String fileName2) {

    Configuration conf = this.conf;

    LOG.info("JMAbuin:: Starting writing reads to HDFS");

    try {
        FileSystem fs = FileSystem.get(conf);

        Path outputFilePath = new Path(this.inputTmpFileName);

        //To write the paired reads
        FSDataOutputStream outputFinalStream = fs.create(outputFilePath, true);

        //To read paired reads from both files
        BufferedReader brFastqFile1 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName1))));
        BufferedReader brFastqFile2 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName2))));

        String lineFastq1;
        String lineFastq2;

        lineFastq1 = brFastqFile1.readLine();
        lineFastq2 = brFastqFile2.readLine();

        //Loop to read the two files; both must contain the same number of lines
        while (lineFastq1 != null) {
            //The lines are written interspersed
            outputFinalStream.write((lineFastq1 + "\n" + lineFastq2 + "\n").getBytes());

            //Read the next lines
            lineFastq1 = brFastqFile1.readLine();
            lineFastq2 = brFastqFile2.readLine();
        }

        //Close the input and output files
        brFastqFile1.close();
        brFastqFile2.close();
        outputFinalStream.close();

        //Now read the previously created file and create the RDD
        ContentSummary cSummary = fs.getContentSummary(outputFilePath);

        long length = cSummary.getLength();

        this.totalInputLength = length;

        fs.close();

        //In case the user wants partitioning
        if (this.options.getPartitionNumber() != 0) {

            //These options are set to indicate the split size and get the correct number of partitions
            this.conf.set("mapreduce.input.fileinputformat.split.maxsize",
                    String.valueOf((length) / this.options.getPartitionNumber()));
            this.conf.set("mapreduce.input.fileinputformat.split.minsize",
                    String.valueOf((length) / this.options.getPartitionNumber()));

            LOG.info("JMAbuin partitioning from HDFS:: "
                    + String.valueOf((length) / this.options.getPartitionNumber()));

            //Using the FastqInputFormatDouble class we get values from the HDFS file. After that, these values are stored in an RDD
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class, Long.class,
                    String.class, this.conf).mapPartitions(new BigFastq2RDDPartitionsDouble(), true);

        } else {
            //Using the FastqInputFormatDouble class we get values from the HDFS file. After that, these values are stored in an RDD
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class, Long.class,
                    String.class, this.conf).map(new BigFastq2RDDDouble());
        }

    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        LOG.error(e.toString());

        return null;
    }
}

From source file:BigBWA.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    for (String argumento : args) {
        LOG.info("Arg: " + argumento);
    }

    String inputPath = "";
    String outputPath = "";

    boolean useReducer = false;

    BwaOptions options = new BwaOptions(args);

    //We set the timeout and establish the bwa library used to call BWA methods
    conf.set("mapreduce.task.timeout", "0");
    conf.set("mapreduce.map.env", "LD_LIBRARY_PATH=./bwa.zip/");

    //==================Algorithm selection==================
    //One of the algorithms is going to be used, because the default is always specified.
    if (options.isMemAlgorithm()) {
        //Case of the mem algorithm
        conf.set("mem", "true");
        conf.set("aln", "false");
        conf.set("bwasw", "false");
    }

    else if (options.isAlnAlgorithm()) {
        // Case of aln algorithm
        conf.set("mem", "false");
        conf.set("aln", "true");
        conf.set("bwasw", "false");
    }

    else if (options.isBwaswAlgorithm()) {
        // Case of bwasw algorithm
        conf.set("mem", "false");
        conf.set("aln", "false");
        conf.set("bwasw", "true");
    }

    //==================Index selection==================
    if (!options.getIndexPath().isEmpty()) {
        conf.set("indexRoute", options.getIndexPath());
    } else {
        System.err.println("No index has been found. Aborting.");
        System.exit(1);
    }

    //==================Type of reads selection==================
    //There is always going to be a type of reads, because the default is paired
    if (options.isPairedReads()) {
        conf.set("paired", "true");
        conf.set("single", "false");
    } else if (options.isSingleReads()) {
        conf.set("paired", "false");
        conf.set("single", "true");
    }

    //==================Use of reducer==================
    if (options.isUseReducer()) {
        useReducer = true;
        conf.set("useReducer", "true");
    } else {
        conf.set("useReducer", "false");
    }

    //==================Number of threads per map==================
    if (!options.getNumThreads().equals("0")) {
        conf.set("bwathreads", options.getNumThreads());
    }

    //==================RG Header===================
    if (!options.getReadgroupHeader().isEmpty()) {
        conf.set("rgheader", options.getReadgroupHeader());
    }

    //==================Input and output paths==================
    inputPath = options.getInputPath();
    outputPath = options.getOutputPath();

    conf.set("outputGenomics", outputPath);

    //==================Partition number==================
    if (options.getPartitionNumber() != 0) {
        try {
            FileSystem fs = FileSystem.get(conf);

            Path inputFilePath = new Path(inputPath);

            ContentSummary cSummary = fs.getContentSummary(inputFilePath);

            long length = cSummary.getLength();

            fs.close();

            conf.set("mapreduce.input.fileinputformat.split.maxsize",
                    String.valueOf((length) / options.getPartitionNumber()));
            conf.set("mapreduce.input.fileinputformat.split.minsize",
                    String.valueOf((length) / options.getPartitionNumber()));
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            LOG.error(e.toString());

            System.exit(1);
        }

    }

    //Job job = new Job(conf,"BigBWA_"+outputPath);
    Job job = Job.getInstance(conf, "BigBWA_" + outputPath);

    job.setJarByClass(BigBWA.class);
    job.setMapperClass(BigBWAMap.class);
    //job.setCombinerClass(BigBWACombiner.class);

    if (useReducer) {
        job.setReducerClass(BigBWAReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setNumReduceTasks(1);
    } else {
        job.setNumReduceTasks(0);
    }

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return (job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.blackberry.logdriver.util.IndexLogs.java

License:Apache License

private static void updateComponent(Map<String, Map<String, Map<String, Map<String, Component>>>> data,
        List<String> unmergedCSVStrings, FileSystem fs, FileStatus matchedFolder, Path path)
        throws IOException, ParseException {
    // Parse path by splitting it across slashes. To determine service (which might contain slashes) grab
    // everything after the DC name, but before the matched date string.
    String[] pathPieces = matchedFolder.getPath().toString().split("/");
    String[] servicePieces = path.toString().split(pathPieces[4] + "/");
    servicePieces = servicePieces[1].split("/" + pathPieces[pathPieces.length - 5]);
    String DC = pathPieces[4];
    String service = servicePieces[0];
    String component = pathPieces[pathPieces.length - 2];
    String type = pathPieces[pathPieces.length - 5];
    String status = pathPieces[pathPieces.length - 1];
    Date date = inputFormat.parse(pathPieces[pathPieces.length - 4]);

    // If the _READY file doesn't exist, add it to the list
    Path READYPath = new Path(path.toString() + "/_READY");
    // System.out.println("Checking for " + READYPath.toString());
    if (!fs.exists(READYPath)) {
        unmergedCSVStrings.add(DC + "," + service + "," + type + "," + component + ","
                + pathPieces[pathPieces.length - 4] + "," + pathPieces[pathPieces.length - 3] + "\n");
        //System.out.println(unmergedCSVString);
    }

    // Check if there is a matching component, create one if not. 
    if (!componentExists(data, DC, service, type, component)) {
        data.get(DC).get(service).get(type).put(component, new Component(DC, service, type, component, date));
    }

    Component thisComponent = data.get(DC).get(service).get(type).get(component);

    // Update the start or end date if the current date is before or after, respectively. 
    if (date.before(thisComponent.startDate)) {
        thisComponent.startDate = date;
    } else if (date.after(thisComponent.endDate)) {
        thisComponent.endDate = date;
    }

    // Is the current folder an archive? If so and date is later than the current archiveDate, update it. 
    if (status.matches("archive") && date.after(thisComponent.archiveDate)) {
        thisComponent.archiveDate = date;
    }

    // Add size data
    if (status.matches("data")) {
        thisComponent.addDataSize(fs.getContentSummary(matchedFolder.getPath()).getLength());
    } else if (status.matches("incoming")) {
        thisComponent.addIncomingSize(fs.getContentSummary(matchedFolder.getPath()).getLength());
    } else if (status.matches("archive")) {
        thisComponent.addArchiveSize(fs.getContentSummary(matchedFolder.getPath()).getLength());
    }
}

From source file:com.blackberry.logdriver.util.LogStats.java

License:Apache License

public static double[] getDataOverTime(FileSystem fs, Component component, Date startDate, Date endDate) {

    if (startDate.after(component.endDate) || endDate.before(component.startDate)) {
        return new double[0];
    }

    // If the date range specified overlaps archived data, notify the user 
    if (startDate.before(component.archiveDate)) {
        System.out.println("Warning: Time range specified includes archived data");
    }

    // Set up the index variable and the output array of hourly log volumes.
    long totalHours = (endDate.getTime() - startDate.getTime()) / oneHour;
    int logVolumesIndex = 0;
    double[] logVolumes = new double[(int) totalHours];
    String basePath = "/service/" + component.DC + "/" + component.service + "/" + component.type + "/";

    for (Long currentDate = startDate.getTime(); currentDate < endDate.getTime(); currentDate += oneHour) {
        @SuppressWarnings("deprecation")
        String dateAndHour = inputFormat.format(new Date(currentDate)) + "/"
                + String.format("%02d", new Date(currentDate).getHours()) + "/";
        Path path = new Path(basePath + dateAndHour + component.component);
        if (component.startDate.getTime() - oneDay < currentDate
                && component.endDate.getTime() + oneDay > currentDate) {
            try {
                logVolumes[logVolumesIndex] = fs.getContentSummary(path).getLength();
            } catch (IOException e) {
                logVolumes[logVolumesIndex] = 0;
            }
        } else {
            logVolumes[logVolumesIndex] = 0;
        }
        logVolumesIndex++;
    }
    return logVolumes;
}

From source file:com.blackberry.logtools.LogTools.java

License:Apache License

public long getSize(long foundresults, String tmp, FileSystem fs) throws Exception {
    if (foundresults == 0) {
        logConsole(true, true, error, "No logs found for the given component(s) and time range.");
        System.exit(1);
    }
    return fs.getContentSummary(new Path(tmp + "/rawlines")).getLength();
}

From source file:com.blm.orc.FileDump.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    List<String> files = new ArrayList<String>();
    List<Integer> rowIndexCols = null;
    for (String arg : args) {
        if (arg.startsWith("--")) {
            if (arg.startsWith(ROWINDEX_PREFIX)) {
                String[] colStrs = arg.substring(ROWINDEX_PREFIX.length()).split(",");
                rowIndexCols = new ArrayList<Integer>(colStrs.length);
                for (String colStr : colStrs) {
                    rowIndexCols.add(Integer.parseInt(colStr));
                }
            } else {
                System.err.println("Unknown argument " + arg);
            }
        } else {
            files.add(arg);
        }
    }

    for (String filename : files) {
        System.out.println("Structure for " + filename);
        Path path = new Path(filename);
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        System.out.println(
                "File Version: " + reader.getFileVersion().getName() + " with " + reader.getWriterVersion());
        RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
        System.out.println("Rows: " + reader.getNumberOfRows());
        System.out.println("Compression: " + reader.getCompression());
        if (reader.getCompression() != CompressionKind.NONE) {
            System.out.println("Compression size: " + reader.getCompressionSize());
        }
        System.out.println("Type: " + reader.getObjectInspector().getTypeName());
        System.out.println("\nStripe Statistics:");
        Metadata metadata = reader.getMetadata();
        for (int n = 0; n < metadata.getStripeStatistics().size(); n++) {
            System.out.println("  Stripe " + (n + 1) + ":");
            StripeStatistics ss = metadata.getStripeStatistics().get(n);
            for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
                System.out.println("    Column " + i + ": " + ss.getColumnStatistics()[i].toString());
            }
        }
        ColumnStatistics[] stats = reader.getStatistics();
        System.out.println("\nFile Statistics:");
        for (int i = 0; i < stats.length; ++i) {
            System.out.println("  Column " + i + ": " + stats[i].toString());
        }
        System.out.println("\nStripes:");
        int stripeIx = -1;
        for (StripeInformation stripe : reader.getStripes()) {
            ++stripeIx;
            long stripeStart = stripe.getOffset();
            System.out.println("  Stripe: " + stripe.toString());
            OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
            long sectionStart = stripeStart;
            for (OrcProto.Stream section : footer.getStreamsList()) {
                System.out.println("    Stream: column " + section.getColumn() + " section " + section.getKind()
                        + " start: " + sectionStart + " length " + section.getLength());
                sectionStart += section.getLength();
            }
            for (int i = 0; i < footer.getColumnsCount(); ++i) {
                OrcProto.ColumnEncoding encoding = footer.getColumns(i);
                StringBuilder buf = new StringBuilder();
                buf.append("    Encoding column ");
                buf.append(i);
                buf.append(": ");
                buf.append(encoding.getKind());
                if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY
                        || encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
                    buf.append("[");
                    buf.append(encoding.getDictionarySize());
                    buf.append("]");
                }
                System.out.println(buf);
            }
            if (rowIndexCols != null) {
                RowIndex[] indices = rows.readRowIndex(stripeIx);
                for (int col : rowIndexCols) {
                    StringBuilder buf = new StringBuilder();
                    buf.append("    Row group index column ").append(col).append(":");
                    RowIndex index = null;
                    if ((col >= indices.length) || ((index = indices[col]) == null)) {
                        buf.append(" not found\n");
                        continue;
                    }
                    for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
                        buf.append("\n      Entry ").append(entryIx).append(":");
                        RowIndexEntry entry = index.getEntry(entryIx);
                        if (entry == null) {
                            buf.append("unknown\n");
                            continue;
                        }
                        OrcProto.ColumnStatistics colStats = entry.getStatistics();
                        if (colStats == null) {
                            buf.append("no stats at ");
                        } else {
                            ColumnStatistics cs = ColumnStatisticsImpl.deserialize(colStats);
                            Object min = RecordReaderImpl.getMin(cs), max = RecordReaderImpl.getMax(cs);
                            buf.append(" count: ").append(cs.getNumberOfValues());
                            buf.append(" min: ").append(min);
                            buf.append(" max: ").append(max);
                        }
                        buf.append(" positions: ");
                        for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
                            if (posIx != 0) {
                                buf.append(",");
                            }
                            buf.append(entry.getPositions(posIx));
                        }
                    }
                    System.out.println(buf);
                }
            }
        }

        FileSystem fs = path.getFileSystem(conf);
        long fileLen = fs.getContentSummary(path).getLength();
        long paddedBytes = getTotalPaddingSize(reader);
        // An empty ORC file is ~45 bytes; the assumption here is that the file length is always > 0
        double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
        DecimalFormat format = new DecimalFormat("##.##");
        System.out.println("\nFile length: " + fileLen + " bytes");
        System.out.println("Padding length: " + paddedBytes + " bytes");
        System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
        rows.close();
    }
}

From source file:com.cloudera.impala.catalog.HBaseTable.java

License:Apache License

/**
 * Returns the Hdfs size of the given region in bytes. NULL can be
 * passed as a parameter to retrieve the size of the complete table.
 */
public long getHdfsSize(HRegionInfo info) throws IOException {
    Path tableDir = HTableDescriptor.getTableDir(FSUtils.getRootDir(hbaseConf_),
            Bytes.toBytes(hbaseTableName_));
    FileSystem fs = tableDir.getFileSystem(hbaseConf_);
    if (info != null) {
        Path regionDir = tableDir.suffix("/" + info.getEncodedName());
        return fs.getContentSummary(regionDir).getLength();
    } else {
        return fs.getContentSummary(tableDir).getLength();
    }
}

From source file:com.cloudera.sqoop.TestTargetDir.java

License:Apache License

/** test target-dir contains imported files. */
public void testTargetDir() throws IOException {

    try {
        String targetDir = getWarehouseDir() + "/tempTargetDir";

        ArrayList args = getOutputArgv(true);
        args.add("--target-dir");
        args.add(targetDir);

        // delete target-dir if exists and recreate it
        FileSystem fs = FileSystem.get(getConf());
        Path outputPath = new Path(targetDir);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        String[] argv = (String[]) args.toArray(new String[0]);
        runImport(argv);

        ContentSummary summ = fs.getContentSummary(outputPath);

        assertTrue("There's no new imported files in target-dir", summ.getFileCount() > 0);

    } catch (Exception e) {
        LOG.error("Got Exception: " + StringUtils.stringifyException(e));
        fail(e.toString());
    }
}

From source file:com.dasasian.chok.command.ListIndicesCommand.java

License:Apache License

private long calculateIndexDiskUsage(String index) {
    Path indexPath = new Path(index);
    URI indexUri = indexPath.toUri();
    try {
        FileSystem fileSystem = FileSystem.get(indexUri, new Configuration());
        if (!fileSystem.exists(indexPath)) {
            return -1;
        }
        return fileSystem.getContentSummary(indexPath).getLength();
    } catch (Exception e) {
        return -1;
    }
}