List of usage examples for org.apache.hadoop.util.LineReader#readLine
public int readLine(Text str, int maxLineLength) throws IOException
From source file:brush.FastqRecordReader.java
License:Apache License
/** * Position the input stream at the start of the first record. * * @param stream The stream to reposition. *//*from w w w .j av a 2 s. c o m*/ protected void positionAtFirstRecord(FSDataInputStream stream) throws IOException { Text buffer = new Text(); if (true) { // (start > 0) // use start>0 to assume that files start with valid data // Advance to the start of the first record that ends with /1 // We use a temporary LineReader to read lines until we find the // position of the right one. We then seek the file to that position. stream.seek(start); LineReader reader = new LineReader(stream); int bytesRead = 0; do { bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); int bufferLength = buffer.getLength(); if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) { start += bytesRead; } else { // line starts with @. Read two more and verify that it starts with a + // // If this isn't the start of a record, we want to backtrack to its end long backtrackPosition = start + bytesRead; bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') { break; // all good! } else { // backtrack to the end of the record we thought was the start. start = backtrackPosition; stream.seek(start); reader = new LineReader(stream); } } } while (bytesRead > 0); stream.seek(start); } pos = start; }
From source file:hadoop.inputsplit.FastaLineRecordReader.java
License:Apache License
private void setKeySeq(FileSystem fs, Configuration job) { //Set currentKey if (Constant.SPLIT2_DEBUG_MODE) currentKey = new Text(file.getName() + "." + start); else {//from www .j a v a2 s.co m try { LineReader reader = new LineReader(fs.open(file), job, recordDelimiterBytes); currentKey = new Text(); reader.readLine(currentKey, maxLineLength); reader.close(); currentKey.set(currentKey.toString().replaceAll(">", "")); } catch (Exception e) { LOG.error(e.getMessage()); currentKey = new Text(file.getName()); } } }
From source file:org.apache.carbondata.processing.csvreaderstep.UnivocityCsvParser.java
License:Apache License
/** * Below method will be used to initialize the reader * * @throws IOException/*from w w w .j a v a 2 s.c o m*/ */ private void initializeReader() throws IOException { // if already one input stream is open first we need to close and then // open new stream close(); // get the block offset long startOffset = this.csvParserVo.getBlockDetailsList().get(blockCounter).getBlockOffset(); FileType fileType = FileFactory .getFileType(this.csvParserVo.getBlockDetailsList().get(blockCounter).getFilePath()); // calculate the end offset the block long endOffset = this.csvParserVo.getBlockDetailsList().get(blockCounter).getBlockLength() + startOffset; // create a input stream for the block DataInputStream dataInputStream = FileFactory.getDataInputStream( this.csvParserVo.getBlockDetailsList().get(blockCounter).getFilePath(), fileType, bufferSize, startOffset); // if start offset is not 0 then reading then reading and ignoring the extra line if (startOffset != 0) { LineReader lineReader = new LineReader(dataInputStream, 1); startOffset += lineReader.readLine(new Text(), 0); } inputStreamReader = new BufferedReader( new InputStreamReader(new CustomDataStream(dataInputStream, endOffset - startOffset))); }
From source file:org.apache.sqoop.connector.hdfs.HdfsExtractor.java
License:Apache License
/** * Extracts Text file/*from w ww.j a v a2 s . c o m*/ * @param file * @param start * @param length * @throws IOException */ @SuppressWarnings("resource") private void extractTextFile(Path file, long start, long length) throws IOException { LOG.info("Extracting text file"); long end = start + length; FileSystem fs = file.getFileSystem(conf); FSDataInputStream filestream = fs.open(file); CompressionCodec codec = (new CompressionCodecFactory(conf)).getCodec(file); LineReader filereader; Seekable fileseeker = filestream; // Hadoop 1.0 does not have support for custom record delimiter and thus // we // are supporting only default one. // We might add another "else if" case for SplittableCompressionCodec once // we drop support for Hadoop 1.0. if (codec == null) { filestream.seek(start); filereader = new LineReader(filestream); } else { filereader = new LineReader(codec.createInputStream(filestream, codec.createDecompressor()), conf); fileseeker = filestream; } if (start != 0) { // always throw away first record because // one extra line is read in previous split start += filereader.readLine(new Text(), 0); } int size; LOG.info("Start position: " + String.valueOf(start)); long next = start; while (next <= end) { Text line = new Text(); size = filereader.readLine(line, Integer.MAX_VALUE); if (size == 0) { break; } if (codec == null) { next += size; } else { next = fileseeker.getPos(); } rowRead++; dataWriter.writeStringRecord(line.toString()); } LOG.info("Extracting ended on position: " + fileseeker.getPos()); filestream.close(); }
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/** * Position the input stream at the start of the first record. * * @param stream The stream to reposition. *//* ww w.j av a 2 s. co m*/ protected final int positionAtFirstRecord(final FSDataInputStream stream, final CompressionCodec codec) throws IOException { Text buffer = new Text(); long originalStart = start; LineReader reader; if (codec == null) { // Advance to the start of the first record that ends with /1 // We use a temporary LineReader to read lines until we find the // position of the right one. We then seek the file to that position. stream.seek(start); reader = new LineReader(stream); } else { // Unlike the codec == null case, we don't seek before creating the // reader, SplittableCompressionCodec.createInputStream places the // stream at the start of the first compression block after our // split start // // as noted above, we need to be at pos 0 in the stream before // calling this reader = new LineReader(((SplittableCompressionCodec) codec).createInputStream(stream, null, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK)); } int bytesRead = 0; do { bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start)); int bufferLength = buffer.getLength(); if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) { start += bytesRead; } else { // line starts with @. 
Read two more and verify that it starts // with a +: // // @<readname> // <sequence> // +[readname] // // if the second line we read starts with a @, we know that // we've read: // // <qualities> <-- @ is a valid ASCII phred encoding // @<readname> // // and thus, the second read is the delimiter and we can break long trackForwardPosition = start + bytesRead; bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start)); if (buffer.getLength() > 0 && buffer.getBytes()[0] == '@') { start = trackForwardPosition; break; } else { trackForwardPosition += bytesRead; } bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start)); trackForwardPosition += bytesRead; if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') { break; // all good! } else { start = trackForwardPosition; } } } while (bytesRead > 0); pos = start; start = originalStart; stream.seek(start); return (int) (pos - originalStart); }