List of usage examples for org.apache.hadoop.io Text getLength
@Override public int getLength()
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/** * Reads a newline into a text record from the underlying line reader. * * @param dest Text record to read line into. * @param eofOk Whether an EOF is acceptable in this line. * @return Returns the number of bytes read. * * @throws EOFException Throws if eofOk was false and we hit an EOF in * the current line.//w w w . ja v a 2 s. c om */ private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException { Text buf = new Text(); int bytesRead = lineReader.readLine(buf, (int) Math.min(maxLineLength, end - start)); // ok, so first, split/unsplit, compressed/uncompressed notwithstanding, // there are three cases we can run into: // // 1. we read data // 2. we are at an acceptable eof/end-of-split and don't read data // 3. we are at an unacceptable eof/end-of-split and don't read data // // cases 1 and 2 are consistent across split/unsplit, compressed/uncompressed. // // case 3 is simple in the unsplit or uncompressed cases; something has // gone wrong, we throw an EOFException, and move on with our lives // // case 3 is where working with split compressed files gets fun. // // with the split compression stream, the first time we read past the // end of the last compression block within a file split, we get no // bytes back. the BZip2Codec and BGZFCodec's actually tell us that // we'll get -2 back in this case, but we'll cast a wider net yet. // // this is important information---if we don't know this, we'll keep reading // past the end of the split to the end of the file---but we still need to // finish reading our multiline record, so we set some state to let us know // that we're reading the last record in the split (endOfCompressedSplit) // and repeat the read. if the read fails again, then that means that // something has actually gone wrong, and we want to fall through and // throw an EOFException or return no bytes read (depending on eofOk). // that's why we have the lastReadWasZeroBytes flag around. 
we set this // to true on the first read that gets bytesRead <= 0, and clear it on // any read that reads more than 0 bytes. if (isSplittable && isCompressed && !lastReadWasZeroBytes && bytesRead <= 0 && !eofOk) { // we need to clear the reader state so we can continue reading ((ResettableCompressedSplitLineReader) lineReader).reset(); // set the state to stop us from reading another record and // to catch back-to-back failed reads lastReadWasZeroBytes = true; endOfCompressedSplit = true; // recursively call to redo the read return appendLineInto(dest, eofOk); } else if (bytesRead < 0 || (bytesRead == 0 && !eofOk)) { throw new EOFException(); } else { lastReadWasZeroBytes = false; } dest.append(buf.getBytes(), 0, buf.getLength()); dest.append(newline, 0, 1); if (isSplittable && isCompressed) { pos = ((SplitCompressionInputStream) inputStream).getPos(); } else { pos += bytesRead; } return bytesRead; }
From source file:org.bgi.flexlab.gaea.data.mapreduce.input.cram.ChromosomeIndex.java
License:Open Source License
public void loadChromosomeIndex() { chromosomeIndexMap = new HashMap<Integer, ChromosomeIndexStructure>(); Path p = new Path(cramIndexFileName); Configuration conf = new Configuration(); FileSystem fs = null;// ww w .j a va 2 s .com try { fs = p.getFileSystem(conf); } catch (IOException e1) { e1.printStackTrace(); } try { if (!fs.exists(p)) { writeIndex(indexForChromosome(new Path(cramFileName))); } } catch (IOException e) { e.printStackTrace(); } try { FSDataInputStream reader = p.getFileSystem(conf).open(p); LineReader lineReader = new LineReader(reader, conf); Text line = new Text(); while (lineReader.readLine(line) > 0) { if (line.getLength() == 0) continue; String[] str = line.toString().split("\t"); chromosomeIndexMap.put(Integer.parseInt(str[0]), new ChromosomeIndexStructure(line.toString())); } lineReader.close(); reader.close(); } catch (IOException e) { e.printStackTrace(); } }
From source file:org.bgi.flexlab.gaea.data.mapreduce.input.vcf.VCFSplit.java
License:Open Source License
private void readFile(Path p) { StringBuilder sb = null;/* ww w .ja va 2s .co m*/ if (!headerHasWrite) { sb = new StringBuilder(); } try { FSDataInputStream table = fs.open(p); LineReader lineReader = new LineReader(table, conf); Text line = new Text(); String tempString = null; while (lineReader.readLine(line) > 0 && line.getLength() != 0) { tempString = line.toString(); if (tempString.startsWith(VCFHeaderStartTag)) { if (headerHasWrite) { continue; } sb.append(tempString.trim()); sb.append("\n"); if (tempString.startsWith(VCFHeaderEndLineTag)) { writeHeader(sb.toString().trim()); headerHasWrite = true; } } else if (tempString.startsWith(SampleTag)) { String sampleName = tempString.split(":")[1]; currentOutput = sample.get(sampleName); } else { write("\n"); if (tempString.startsWith("chr1\t179462149")) { System.out.println("debug:" + tempString); } write(tempString); } } lineReader.close(); table.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
From source file:org.bgi.flexlab.gaea.tools.bamqualtiycontrol.report.BamReport.java
License:Open Source License
/**
 * Parses every reducer output file directly under {@code oPath} and writes
 * one report per sample.
 *
 * Directories and files whose name starts with '_' are skipped. Each
 * remaining file is read line by line; a line containing "sample:" selects
 * (or registers) the report for that sample, and every non-empty line is then
 * passed to {@code ReportBuilder.parseReport}.
 *
 * @param options quality-control options; decides between a region report
 *                (region or bed file set) and a whole-genome report
 * @param conf    Hadoop configuration used to resolve the file system
 * @param oPath   directory holding the output files to parse
 * @throws IOException if listing, reading or writing fails
 */
public static void getOutput(BamQualityControlOptions options, Configuration conf, Path oPath)
        throws IOException {
    ReportBuilder reportBuilder = new ReportBuilder();
    ResultReport reportType;
    ReferenceShare genome = new ReferenceShare();
    genome.loadChromosomeList(options.getReferenceSequencePath());

    if ((options.getRegion() != null) || (options.getBedfile() != null))
        reportType = new RegionResultReport(options, conf);
    else
        reportType = new WholeGenomeResultReport(options);

    Map<String, ResultReport> reports = new ConcurrentHashMap<String, ResultReport>();
    FileSystem fs = oPath.getFileSystem(conf);
    FileStatus[] filelist = fs.listStatus(oPath);

    for (FileStatus status : filelist) {
        Path file = status.getPath();
        // Test the final path component: the full path string begins with the
        // scheme or parent directory, so checking it for a leading '_' would
        // never skip marker files such as _SUCCESS.
        if (status.isDir() || file.getName().startsWith("_")) {
            continue;
        }

        // try-with-resources closes the stream even if parsing throws.
        // Closing the stream is sufficient: the LineReader only wraps it.
        try (FSDataInputStream reader = fs.open(file)) {
            LineReader lineReader = new LineReader(reader, conf);
            Text line = new Text();
            while (lineReader.readLine(line) > 0) {
                if (line.getLength() == 0) {
                    continue;
                }
                String lineString = line.toString();
                if (lineString.contains("sample:")) {
                    String sample = lineString.split(":")[1];
                    if (!reports.containsKey(sample)) {
                        // NOTE(review): every new sample is mapped to the
                        // current reportType instance; confirm that sharing
                        // one instance across samples is intended.
                        reports.put(sample, reportType);
                        reportBuilder.setReportChoice(reportType);
                        reportBuilder.initReports(sample);
                    } else {
                        reportType = reports.get(sample);
                        reportBuilder.setReportChoice(reportType);
                    }
                }
                reportBuilder.parseReport(lineReader, line, genome);
            }
        }
    }

    for (Map.Entry<String, ResultReport> entry : reports.entrySet()) {
        System.err.println("sample:" + entry.getKey());
        entry.getValue().write(fs, entry.getKey());
    }

    fs.close();
}
From source file:org.bgi.flexlab.gaea.tools.bamqualtiycontrol.report.RegionResultReport.java
License:Open Source License
/**
 * Handles the region-specific sections of a reducer output file, after the
 * superclass has handled the common ones.
 *
 * The text of {@code line} on entry (after the superclass call) decides which
 * section is parsed. For single-record sections the next line is read and,
 * if non-empty, handed to the matching report. For the "CNV Depth" section,
 * lines are consumed until end of input, an empty line, or a line containing
 * "CNV Depth".
 *
 * @param lineReader reader positioned just after {@code line}
 * @param line       current line; overwritten by any further reads
 * @param genome     reference information, forwarded to the superclass
 * @throws IOException if reading from {@code lineReader} fails
 */
@Override
public void parseReport(LineReader lineReader, Text line, ReferenceShare genome) throws IOException {
    super.parseReport(lineReader, line, genome);

    // Section header as it stands now; later reads into `line` do not
    // affect which sections are matched.
    final String header = line.toString();

    if (header.contains("Target Information") && readNextNonEmptyLine(lineReader, line)) {
        regionReport.parse(line.toString());
    }

    if (header.startsWith("bed single Region Statistic") && readNextNonEmptyLine(lineReader, line)) {
        bedSingleRegionReport.parseReducerOutput(line.toString(), false);
    }

    if (header.startsWith("bed part single Region Statistic") && readNextNonEmptyLine(lineReader, line)) {
        bedSingleRegionReport.parseReducerOutput(line.toString(), true);
    }

    if (header.startsWith("gender single Region Statistic") && readNextNonEmptyLine(lineReader, line)) {
        genderSingleRegionReport.parseReducerOutput(line.toString(), false);
    }

    if (header.startsWith("gender part single Region Statistic") && readNextNonEmptyLine(lineReader, line)) {
        genderSingleRegionReport.parseReducerOutput(line.toString(), true);
    }

    if (header.startsWith("CNV Depth")) {
        while (readNextNonEmptyLine(lineReader, line)) {
            final String record = line.toString();
            if (record.contains("CNV Depth")) {
                break;
            }
            cnvDepthReport.add(record);
        }
    }
}

/** Reads the next line into {@code line}; true only if bytes were read and the line is non-empty. */
private static boolean readNextNonEmptyLine(LineReader lineReader, Text line) throws IOException {
    return lineReader.readLine(line) > 0 && line.getLength() != 0;
}
From source file:org.bgi.flexlab.gaea.tools.bamqualtiycontrol.report.ResultReport.java
License:Open Source License
public void parseReport(LineReader lineReader, Text line, ReferenceShare genome) throws IOException { String lineString = line.toString(); String chrName = ""; if (lineString.contains("chrName:")) { String[] sampleSplit = line.toString().split(":"); chrName = sampleSplit[1];//from w ww .j a v a 2 s .com } if (lineString.contains("Basic Information")) { if (lineReader.readLine(line) > 0 && line.getLength() != 0) { basicReport.parse(line.toString()); } } if (lineString.startsWith("cnv part single Region Statistic")) { if (lineReader.readLine(line) > 0 && line.getLength() != 0) { cnvSingleRegionReport.parseReducerOutput(line.toString(), true); } } if (lineString.startsWith("cnv single Region Statistic")) { if (lineReader.readLine(line) > 0 && line.getLength() != 0) { cnvSingleRegionReport.parseReducerOutput(line.toString(), false); } } if (lineString.startsWith("Region Depth")) { if (lineReader.readLine(line) > 0 && line.getLength() != 0) { regionCoverReport.parseReducerOutput(line.toString()); } } if (lineString.startsWith("RMDUP Region Depth")) { if (lineReader.readLine(line) > 0 && line.getLength() != 0) { rmdupRegionCoverReport.parseReducerOutput(line.toString()); } } if (lineString.contains("insert size information")) { fillInsertSize(lineReader, line, insertSize); } if (lineString.contains("insert size without dup information")) { fillInsertSize(lineReader, line, insertSizeWithoutDup); } if (lineString.contains("unmapped site information") && options.isOutputUnmapped()) { String[] splitArray = null; ArrayList<Long> unmappedSites = unmappedReport.getUnmappedSites(chrName); while (lineReader.readLine(line) > 0 && line.getLength() != 0) { if (line.toString().contains("unmapped site information")) { break; } splitArray = line.toString().split("\t"); unmappedSites.add(Long.parseLong(splitArray[0])); unmappedSites.add(Long.parseLong(splitArray[1])); } } }
From source file:org.bgi.flexlab.gaea.tools.bamqualtiycontrol.report.ResultReport.java
License:Open Source License
private void fillInsertSize(LineReader lineReader, Text line, int[] insertSize) throws RuntimeException, IOException { String[] splitArray = null;// w w w. j a va 2 s. c o m while (lineReader.readLine(line) > 0 && line.getLength() != 0) { if (line.toString().contains("insert size")) { break; } splitArray = line.toString().split("\t"); int index = Integer.parseInt(splitArray[0]); insertSize[index] += Integer.parseInt(splitArray[1]); } }
From source file:org.bgi.flexlab.gaea.tools.bamqualtiycontrol.report.WholeGenomeResultReport.java
License:Open Source License
/**
 * Handles the "Cover Information" section of a reducer output file, after
 * the superclass has handled the common sections.
 *
 * When the current line contains "Cover Information", the next line is read
 * and split on tabs. A token without a space is treated as a chromosome
 * name and selects (creating on first sight) that chromosome's cover report;
 * any other token is parsed into the most recently selected report.
 *
 * @param lineReader reader positioned just after {@code line}
 * @param line       current line; overwritten by any further reads
 * @param genome     source of chromosome information for new cover reports
 * @throws IOException if reading from {@code lineReader} fails
 */
@Override
public void parseReport(LineReader lineReader, Text line, ReferenceShare genome) throws IOException {
    super.parseReport(lineReader, line, genome);

    if (!line.toString().contains("Cover Information")) {
        return;
    }
    if (lineReader.readLine(line) <= 0 || line.getLength() == 0) {
        return;
    }

    WholeGenomeCoverReport current = null;
    for (String token : line.toString().split("\t")) {
        if (token.split(" ").length != 1) {
            // Data token: belongs to whichever chromosome was named last.
            current.parse(token, genome);
            continue;
        }

        // Single-word token: a chromosome name.
        if (coverReports.containsKey(token)) {
            current = coverReports.get(token);
        } else {
            ChromosomeInformationShare info = genome.getChromosomeInfo(token);
            current = new WholeGenomeCoverReport(info);
            coverReports.put(token, current);
        }
    }
}
From source file:org.cloudata.core.common.util.CloudataLineReader.java
License:Apache License
/**
 * Read one line from the InputStream into the given Text.
 *
 * The Text is cleared first. A line ends at '\n', at '\r' followed by '\n',
 * or at a lone '\r'; terminator bytes are not stored in {@code str}.
 *
 * @param str
 *            the object to store the given line
 * @return the number of bytes read including the newline, computed as the
 *         stored length plus the number of terminator bytes seen
 * @throws IOException
 *             if the underlying stream throws
 */
public int readLine(Text str) throws IOException {
    str.clear();
    boolean hadFinalNewline = false; // a '\n' terminated the line
    boolean hadFinalReturn = false;  // a '\r' has been seen for this line
    boolean hitEndOfFile = false;    // backfill() reported no more data
    int startPosn = bufferPosn;
    outerLoop: while (true) {
        if (bufferPosn >= bufferLength) {
            // Buffer consumed; presumably backfill() refills it and returns
            // false at end of stream -- verify against its definition.
            if (!backfill()) {
                hitEndOfFile = true;
                break;
            }
        }
        // Start of this line's data within the current buffer contents.
        startPosn = bufferPosn;
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            switch (buffer[bufferPosn]) {
            case '\n':
                hadFinalNewline = true;
                // Consume the '\n' before leaving.
                bufferPosn += 1;
                break outerLoop;
            case '\r':
                if (hadFinalReturn) {
                    // A second '\r' follows the first: stop without consuming
                    // it, so it is still in the buffer for the next call.
                    break outerLoop;
                }
                hadFinalReturn = true;
                break;
            default:
                if (hadFinalReturn) {
                    // Ordinary byte after a '\r': the line ended at the
                    // '\r'; leave this byte for the next call.
                    break outerLoop;
                }
            }
        }
        // Buffer exhausted without finishing the line: store what was
        // scanned, excluding a '\r' if one has been seen, then loop to refill.
        int length = bufferPosn - startPosn - (hadFinalReturn ? 1 : 0);
        if (length >= 0) {
            str.append(buffer, startPosn, length);
        }
    }
    int newlineLength = (hadFinalNewline ? 1 : 0) + (hadFinalReturn ? 1 : 0);
    if (!hitEndOfFile) {
        // Store the remaining line bytes from the current buffer, excluding
        // the terminator bytes.
        int length = bufferPosn - startPosn - newlineLength;
        if (length > 0) {
            str.append(buffer, startPosn, length);
        }
    }
    return str.getLength() + newlineLength;
}
From source file:org.cloudata.examples.web.DocFreqReduce.java
License:Apache License
public void reduce(WritableComparable key, Iterator<Writable> values, OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException { if (exception != null) { throw exception; }// w w w. j ava2s . c o m Text tKey = (Text) key; Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, tKey.getLength()); int docFreq = 0; while (values.hasNext()) { docFreq++; } Row row = new Row(rowKey); try { row.addCell("df", new Cell(Cell.Key.EMPTY_KEY, Long.toString(docFreq).getBytes())); termTable.put(row); } catch (Exception e) { LOG.error(e.getMessage(), e); } }