Example usage for org.apache.hadoop.io Text getLength

List of usage examples for org.apache.hadoop.io Text getLength

Introduction

On this page you can find example usages of org.apache.hadoop.io.Text.getLength().

Prototype

@Override
public int getLength() 

Source Link

Document

Returns the number of bytes in the byte array

Usage

From source file:org.bdgenomics.adam.io.FastqRecordReader.java

License:Apache License

/**
 * Reads one line from the underlying line reader and appends it to a text
 * record, followed by one byte taken from the {@code newline} field.
 *
 * @param dest Text record to append the line to.
 * @param eofOk Whether reading zero bytes is acceptable for this line.
 * @return Returns the number of bytes read, as reported by the line reader.
 *
 * @throws EOFException Throws if the read returned a negative count, or if
 *    it returned zero bytes while eofOk was false (after the single retry
 *    described in the body for splittable compressed input).
 * @throws IOException Declared by the signature; presumably propagated from
 *    the line reader or the input stream -- confirm against those classes.
 */
private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException {
    // read into a scratch buffer; the line length is capped at the smaller
    // of maxLineLength and (end - start)
    Text buf = new Text();
    int bytesRead = lineReader.readLine(buf, (int) Math.min(maxLineLength, end - start));

    // ok, so first, split/unsplit, compressed/uncompressed notwithstanding,
    // there are three cases we can run into:
    //
    // 1. we read data
    // 2. we are at an acceptable eof/end-of-split and don't read data
    // 3. we are at an unacceptable eof/end-of-split and don't read data
    //
    // cases 1 and 2 are consistent across split/unsplit, compressed/uncompressed.
    //
    // case 3 is simple in the unsplit or uncompressed cases; something has
    // gone wrong, we throw an EOFException, and move on with our lives
    //
    // case 3 is where working with split compressed files gets fun.
    //
    // with the split compression stream, the first time we read past the
    // end of the last compression block within a file split, we get no
    // bytes back. the BZip2Codec and BGZFCodec's actually tell us that
    // we'll get -2 back in this case, but we'll cast a wider net yet.
    //
    // this is important information---if we don't know this, we'll keep reading
    // past the end of the split to the end of the file---but we still need to
    // finish reading our multiline record, so we set some state to let us know
    // that we're reading the last record in the split (endOfCompressedSplit)
    // and repeat the read. if the read fails again, then that means that
    // something has actually gone wrong, and we want to fall through and
    // throw an EOFException or return no bytes read (depending on eofOk).
    // that's why we have the lastReadWasZeroBytes flag around. we set this
    // to true on the first read that gets bytesRead <= 0, and clear it on
    // any read that reads more than 0 bytes.
    if (isSplittable && isCompressed && !lastReadWasZeroBytes && bytesRead <= 0 && !eofOk) {

        // we need to clear the reader state so we can continue reading
        ((ResettableCompressedSplitLineReader) lineReader).reset();

        // set the state to stop us from reading another record and
        // to catch back-to-back failed reads
        lastReadWasZeroBytes = true;
        endOfCompressedSplit = true;

        // recursively call to redo the read
        return appendLineInto(dest, eofOk);
    } else if (bytesRead < 0 || (bytesRead == 0 && !eofOk)) {
        // a negative count always fails; a zero count fails only when the
        // caller did not allow an EOF here
        throw new EOFException();
    } else {
        // successful (or acceptable empty) read: clear the retry guard
        lastReadWasZeroBytes = false;
    }

    // copy the line into the destination and terminate it with the first
    // byte of the newline array
    dest.append(buf.getBytes(), 0, buf.getLength());
    dest.append(newline, 0, 1);
    if (isSplittable && isCompressed) {
        // for splittable compressed input the position is taken from the
        // compression stream rather than counted locally
        pos = ((SplitCompressionInputStream) inputStream).getPos();
    } else {
        pos += bytesRead;
    }

    return bytesRead;
}

From source file:org.bgi.flexlab.gaea.data.mapreduce.input.cram.ChromosomeIndex.java

License:Open Source License

/**
 * Loads the chromosome index stored at {@code cramIndexFileName} into
 * {@code chromosomeIndexMap}, replacing any previous contents.
 *
 * <p>If the index file does not exist yet it is first generated from
 * {@code cramFileName} via {@code indexForChromosome} and {@code writeIndex}.
 * Each non-empty line of the index file is tab-separated; its first column is
 * parsed as the integer map key and the whole line is handed to
 * {@link ChromosomeIndexStructure}.
 *
 * <p>I/O failures are reported with {@code printStackTrace()} and leave the map
 * empty or partially filled; they are not propagated, because the method
 * signature declares no checked exceptions.
 */
public void loadChromosomeIndex() {
    chromosomeIndexMap = new HashMap<Integer, ChromosomeIndexStructure>();
    Path p = new Path(cramIndexFileName);
    Configuration conf = new Configuration();

    FileSystem fs;
    try {
        fs = p.getFileSystem(conf);
    } catch (IOException e) {
        // Without a file system none of the steps below can run. Bail out here
        // instead of dereferencing a null FileSystem further down.
        e.printStackTrace();
        return;
    }

    try {
        if (!fs.exists(p)) {
            writeIndex(indexForChromosome(new Path(cramFileName)));
        }
    } catch (IOException e) {
        // Best effort: still try to read whatever index is available below.
        e.printStackTrace();
    }

    try {
        FSDataInputStream reader = fs.open(p);
        try {
            LineReader lineReader = new LineReader(reader, conf);
            Text line = new Text();
            while (lineReader.readLine(line) > 0) {
                if (line.getLength() == 0) {
                    continue;
                }
                String record = line.toString();
                String[] str = record.split("\t");
                chromosomeIndexMap.put(Integer.parseInt(str[0]), new ChromosomeIndexStructure(record));
            }
        } finally {
            // Close the stream even when reading or parsing fails part-way through.
            reader.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:org.bgi.flexlab.gaea.data.mapreduce.input.vcf.VCFSplit.java

License:Open Source License

/**
 * Reads the file at {@code p} line by line and dispatches each line by its prefix.
 *
 * <ul>
 *   <li>Lines starting with {@code VCFHeaderStartTag} are header lines. They are
 *       skipped once the header has been written; otherwise they are accumulated,
 *       and when a line starting with {@code VCFHeaderEndLineTag} is seen the
 *       collected header is passed to {@code writeHeader} and
 *       {@code headerHasWrite} is set.</li>
 *   <li>Lines starting with {@code SampleTag} select {@code currentOutput} using
 *       the text after the first {@code ':'} as the sample name.</li>
 *   <li>Every other line is written via {@code write}, preceded by a newline.</li>
 * </ul>
 *
 * <p>Reading stops at end of input or at the first empty line. An
 * {@link IOException} is reported with {@code printStackTrace()} and not
 * propagated.
 *
 * @param p path of the file to read, opened through the {@code fs} field
 */
private void readFile(Path p) {
    // The header buffer is only needed while the header is still unwritten.
    StringBuilder sb = null;
    if (!headerHasWrite) {
        sb = new StringBuilder();
    }
    try {
        FSDataInputStream table = fs.open(p);
        try {
            LineReader lineReader = new LineReader(table, conf);
            Text line = new Text();
            while (lineReader.readLine(line) > 0 && line.getLength() != 0) {
                String tempString = line.toString();
                if (tempString.startsWith(VCFHeaderStartTag)) {
                    if (headerHasWrite) {
                        continue;
                    }
                    sb.append(tempString.trim());
                    sb.append("\n");
                    if (tempString.startsWith(VCFHeaderEndLineTag)) {
                        writeHeader(sb.toString().trim());
                        headerHasWrite = true;
                    }
                } else if (tempString.startsWith(SampleTag)) {
                    String sampleName = tempString.split(":")[1];
                    currentOutput = sample.get(sampleName);
                } else {
                    write("\n");
                    write(tempString);
                }
            }
        } finally {
            // Release the stream even if a write or a read fails mid-file.
            table.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

}

From source file:org.bgi.flexlab.gaea.tools.bamqualtiycontrol.report.BamReport.java

License:Open Source License

/**
 * Parses the result files found directly under {@code oPath} and writes one
 * report per sample.
 *
 * <p>Directories and entries whose file name starts with {@code "_"} are
 * skipped. Each remaining file is read line by line: a line containing
 * {@code "sample:"} selects (or registers) the report for the sample named
 * after the first {@code ':'}, and every non-empty line is passed to
 * {@code ReportBuilder.parseReport}. After all files are read, each registered
 * report is written with {@code ResultReport.write}, and the file system
 * obtained from {@code oPath} is closed.
 *
 * @param options quality-control options; decides between a region report and
 *                a whole-genome report and supplies the reference path
 * @param conf    Hadoop configuration used to resolve the file system and to
 *                configure the line readers
 * @param oPath   directory whose files are parsed
 * @throws IOException if listing, opening, reading or writing fails
 */
public static void getOutput(BamQualityControlOptions options, Configuration conf, Path oPath)
        throws IOException {
    ReportBuilder reportBuilder = new ReportBuilder();
    ResultReport reportType;
    ReferenceShare genome = new ReferenceShare();
    genome.loadChromosomeList(options.getReferenceSequencePath());

    if ((options.getRegion() != null) || (options.getBedfile() != null))
        reportType = new RegionResultReport(options, conf);
    else
        reportType = new WholeGenomeResultReport(options);

    Map<String, ResultReport> reports = new ConcurrentHashMap<String, ResultReport>();
    FileSystem fs = oPath.getFileSystem(conf);
    FileStatus filelist[] = fs.listStatus(oPath);
    for (int i = 0; i < filelist.length; i++) {
        Path file = filelist[i].getPath();
        // Test the last path component: the full path string begins with the
        // scheme or parent directory, so a prefix check on it never matches "_".
        if (filelist[i].isDir() || file.getName().startsWith("_")) {
            continue;
        }

        FSDataInputStream reader = fs.open(file);
        try {
            LineReader lineReader = new LineReader(reader, conf);
            Text line = new Text();
            while (lineReader.readLine(line) > 0) {
                String lineString = line.toString();
                if (line.getLength() == 0) {
                    continue;
                }

                if (lineString.contains("sample:")) {
                    String sample = lineString.split(":")[1];
                    if (!reports.containsKey(sample)) {
                        // NOTE(review): every new sample is registered with whatever
                        // report instance is current, so samples may share one
                        // ResultReport object -- confirm this is intended.
                        reports.put(sample, reportType);
                        reportBuilder.setReportChoice(reportType);
                        reportBuilder.initReports(sample);
                    } else {
                        reportType = reports.get(sample);
                        reportBuilder.setReportChoice(reportType);
                    }
                }
                reportBuilder.parseReport(lineReader, line, genome);
            }
        } finally {
            // Close the stream even when parsing throws part-way through a file.
            reader.close();
        }
    }

    for (Map.Entry<String, ResultReport> entry : reports.entrySet()) {
        String sampleName = entry.getKey();
        System.err.println("sample:" + sampleName);
        entry.getValue().write(fs, sampleName);
    }

    fs.close();
}

From source file:org.bgi.flexlab.gaea.tools.bamqualtiycontrol.report.RegionResultReport.java

License:Open Source License

/**
 * Parses the region-specific sections of a report after the common sections
 * have been handled by the superclass.
 *
 * <p>The text held by {@code line} once the superclass returns is treated as a
 * section heading. For the single-line sections the following line is read and,
 * if non-empty, forwarded to the matching report. For the {@code "CNV Depth"}
 * section, lines are collected until end of input, an empty line, or another
 * line containing {@code "CNV Depth"}.
 *
 * @param lineReader reader positioned just after the heading line
 * @param line       buffer holding the heading; overwritten by subsequent reads
 * @param genome     reference information, passed through to the superclass
 * @throws IOException if reading from {@code lineReader} fails
 */
@Override
public void parseReport(LineReader lineReader, Text line, ReferenceShare genome) throws IOException {
    super.parseReport(lineReader, line, genome);
    final String heading = line.toString();

    if (heading.contains("Target Information")
            && lineReader.readLine(line) > 0 && line.getLength() != 0) {
        regionReport.parse(line.toString());
    }

    if (heading.startsWith("bed single Region Statistic")
            && lineReader.readLine(line) > 0 && line.getLength() != 0) {
        bedSingleRegionReport.parseReducerOutput(line.toString(), false);
    }

    if (heading.startsWith("bed part single Region Statistic")
            && lineReader.readLine(line) > 0 && line.getLength() != 0) {
        bedSingleRegionReport.parseReducerOutput(line.toString(), true);
    }

    if (heading.startsWith("gender single Region Statistic")
            && lineReader.readLine(line) > 0 && line.getLength() != 0) {
        genderSingleRegionReport.parseReducerOutput(line.toString(), false);
    }

    if (heading.startsWith("gender part single Region Statistic")
            && lineReader.readLine(line) > 0 && line.getLength() != 0) {
        genderSingleRegionReport.parseReducerOutput(line.toString(), true);
    }

    if (heading.startsWith("CNV Depth")) {
        // Collect depth rows until the closing "CNV Depth" marker, an empty
        // line, or the end of the input.
        while (lineReader.readLine(line) > 0 && line.getLength() != 0) {
            final String row = line.toString();
            if (row.contains("CNV Depth")) {
                break;
            }
            cnvDepthReport.add(row);
        }
    }
}

From source file:org.bgi.flexlab.gaea.tools.bamqualtiycontrol.report.ResultReport.java

License:Open Source License

/**
 * Parses the report sections common to all report types.
 *
 * <p>The current content of {@code line} is treated as a section heading. For
 * the single-line sections the next line is read and, if non-empty, forwarded
 * to the matching report. The insert-size sections are delegated to
 * {@code fillInsertSize}. The unmapped-site section is read only when
 * {@code options.isOutputUnmapped()} is true: each row is split on tabs and its
 * first two columns are appended, as longs, to the unmapped-site list of the
 * chromosome named in the heading (empty name if the heading carries none).
 *
 * @param lineReader reader positioned just after the heading line
 * @param line       buffer holding the heading; overwritten by subsequent reads
 * @param genome     reference information (not used by this implementation)
 * @throws IOException if reading from {@code lineReader} fails
 */
public void parseReport(LineReader lineReader, Text line, ReferenceShare genome) throws IOException {
    final String heading = line.toString();
    // Chromosome name is whatever follows the first ':' of a "chrName:" heading.
    final String chrName = heading.contains("chrName:") ? heading.split(":")[1] : "";

    if (heading.contains("Basic Information")
            && lineReader.readLine(line) > 0 && line.getLength() != 0) {
        basicReport.parse(line.toString());
    }

    if (heading.startsWith("cnv part single Region Statistic")
            && lineReader.readLine(line) > 0 && line.getLength() != 0) {
        cnvSingleRegionReport.parseReducerOutput(line.toString(), true);
    }

    if (heading.startsWith("cnv single Region Statistic")
            && lineReader.readLine(line) > 0 && line.getLength() != 0) {
        cnvSingleRegionReport.parseReducerOutput(line.toString(), false);
    }

    if (heading.startsWith("Region Depth")
            && lineReader.readLine(line) > 0 && line.getLength() != 0) {
        regionCoverReport.parseReducerOutput(line.toString());
    }

    if (heading.startsWith("RMDUP Region Depth")
            && lineReader.readLine(line) > 0 && line.getLength() != 0) {
        rmdupRegionCoverReport.parseReducerOutput(line.toString());
    }

    if (heading.contains("insert size information")) {
        fillInsertSize(lineReader, line, insertSize);
    }

    if (heading.contains("insert size without dup information")) {
        fillInsertSize(lineReader, line, insertSizeWithoutDup);
    }

    if (heading.contains("unmapped site information") && options.isOutputUnmapped()) {
        final ArrayList<Long> sites = unmappedReport.getUnmappedSites(chrName);
        // Read rows until the closing marker, an empty line, or end of input.
        while (lineReader.readLine(line) > 0 && line.getLength() != 0) {
            final String row = line.toString();
            if (row.contains("unmapped site information")) {
                break;
            }
            final String[] columns = row.split("\t");

            sites.add(Long.parseLong(columns[0]));
            sites.add(Long.parseLong(columns[1]));
        }
    }
}

From source file:org.bgi.flexlab.gaea.tools.bamqualtiycontrol.report.ResultReport.java

License:Open Source License

/**
 * Accumulates insert-size counts from consecutive report rows.
 *
 * <p>Rows are read until end of input, an empty line, or a line containing
 * {@code "insert size"}. Each row is split on tabs; the first column is the
 * array index and the second column is added to the value at that index.
 *
 * @param lineReader reader positioned at the first data row
 * @param line       reusable buffer that receives each row
 * @param insertSize histogram updated in place
 * @throws RuntimeException declared by the original signature
 * @throws IOException      if reading from {@code lineReader} fails
 */
private void fillInsertSize(LineReader lineReader, Text line, int[] insertSize)
        throws RuntimeException, IOException {
    for (;;) {
        if (lineReader.readLine(line) <= 0 || line.getLength() == 0) {
            return;
        }
        final String row = line.toString();
        if (row.contains("insert size")) {
            return;
        }
        final String[] columns = row.split("\t");
        final int bucket = Integer.parseInt(columns[0]);
        insertSize[bucket] += Integer.parseInt(columns[1]);
    }
}

From source file:org.bgi.flexlab.gaea.tools.bamqualtiycontrol.report.WholeGenomeResultReport.java

License:Open Source License

/**
 * Parses the whole-genome coverage section after the common sections have been
 * handled by the superclass.
 *
 * <p>When the text held by {@code line} contains {@code "Cover Information"},
 * the next line is read and split on tabs. A field without a space is taken as
 * a chromosome name and selects (creating it on first sight) the cover report
 * for that chromosome; any other field is parsed by the currently selected
 * cover report.
 *
 * @param lineReader reader positioned just after the heading line
 * @param line       buffer holding the heading; overwritten by the next read
 * @param genome     source of per-chromosome information for new cover reports
 * @throws IOException if reading from {@code lineReader} fails
 */
@Override
public void parseReport(LineReader lineReader, Text line, ReferenceShare genome) throws IOException {
    super.parseReport(lineReader, line, genome);

    if (!line.toString().contains("Cover Information")) {
        return;
    }
    if (lineReader.readLine(line) <= 0 || line.getLength() == 0) {
        return;
    }

    WholeGenomeCoverReport current = null;
    for (String field : line.toString().split("\t")) {
        if (field.split(" ").length != 1) {
            // Key/value field: belongs to the most recently named chromosome.
            current.parse(field, genome);
            continue;
        }

        // Bare token: a chromosome name that switches the active report.
        if (coverReports.containsKey(field)) {
            current = coverReports.get(field);
        } else {
            ChromosomeInformationShare info = genome.getChromosomeInfo(field);
            current = new WholeGenomeCoverReport(info);
            coverReports.put(field, current);
        }
    }
}

From source file:org.cloudata.core.common.util.CloudataLineReader.java

License:Apache License

/**
 * Read one line from the InputStream into the given Text.
 *
 * The target is cleared first. A line ends at a line feed, or after a
 * carriage return (optionally followed by a line feed); the terminator
 * bytes are not stored in the Text.
 *
 * @param str
 *          the object to store the given line
 * @return the number of bytes stored in {@code str} plus the number of
 *         terminator bytes counted for this line
 * @throws IOException
 *           if the underlying stream throws
 */
public int readLine(Text str) throws IOException {
    str.clear();
    boolean hadFinalNewline = false; // a '\n' ended the line
    boolean hadFinalReturn = false;  // a '\r' has been seen for this line
    boolean hitEndOfFile = false;    // backfill() reported no more data
    int startPosn = bufferPosn;
    outerLoop: while (true) {
        // buffer fully consumed: ask backfill() for more (presumably it
        // refills buffer and resets bufferPosn/bufferLength -- confirm)
        if (bufferPosn >= bufferLength) {
            if (!backfill()) {
                hitEndOfFile = true;
                break;
            }
        }
        // start of the chunk of this line that lives in the current buffer
        startPosn = bufferPosn;
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            switch (buffer[bufferPosn]) {
            case '\n':
                // line feed ends the line; step past it so it is consumed
                hadFinalNewline = true;
                bufferPosn += 1;
                break outerLoop;
            case '\r':
                if (hadFinalReturn) {
                    // second '\r' in a row: stop without consuming it, so
                    // the next call sees it
                    break outerLoop;
                }
                hadFinalReturn = true;
                break;
            default:
                // ordinary byte right after a '\r': the '\r' alone ended the
                // line; stop without consuming this byte
                if (hadFinalReturn) {
                    break outerLoop;
                }
            }
        }
        // reached the end of the buffer without finishing the line: keep the
        // scanned chunk, one byte shorter if a '\r' has been seen
        int length = bufferPosn - startPosn - (hadFinalReturn ? 1 : 0);
        if (length >= 0) {
            str.append(buffer, startPosn, length);
        }
    }
    // number of terminator bytes counted for this line (0, 1 or 2)
    int newlineLength = (hadFinalNewline ? 1 : 0) + (hadFinalReturn ? 1 : 0);
    if (!hitEndOfFile) {
        // the loop was left from inside the scan: append the part of the
        // current chunk that precedes the terminator bytes
        int length = bufferPosn - startPosn - newlineLength;
        if (length > 0) {
            str.append(buffer, startPosn, length);
        }
    }
    return str.getLength() + newlineLength;
}

From source file:org.cloudata.examples.web.DocFreqReduce.java

License:Apache License

/**
 * Counts the values grouped under {@code key} and stores that count as the
 * document frequency of the term.
 *
 * <p>The key is expected to be a {@link Text}; its valid bytes form the row key.
 * The count is written as a decimal string into column {@code "df"} of a new row
 * that is put into {@code termTable}. A failure while storing the row is logged
 * and not propagated.
 *
 * @param key       the term, as a {@code Text}
 * @param values    one element per occurrence to count; fully consumed
 * @param collector not used by this reducer
 * @param reporter  not used by this reducer
 * @throws IOException if the {@code exception} field is non-null, in which
 *                     case it is rethrown before any work is done
 */
public void reduce(WritableComparable key, Iterator<Writable> values,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    if (exception != null) {
        throw exception;
    }
    Text tKey = (Text) key;
    // getBytes() exposes the backing array; only the first getLength() bytes are valid.
    Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, tKey.getLength());

    int docFreq = 0;
    while (values.hasNext()) {
        // Advance the iterator; without next() hasNext() stays true forever.
        values.next();
        docFreq++;
    }

    Row row = new Row(rowKey);
    try {
        row.addCell("df", new Cell(Cell.Key.EMPTY_KEY, Long.toString(docFreq).getBytes()));
        termTable.put(row);
    } catch (Exception e) {
        LOG.error(e.getMessage(), e);
    }
}