List of usage examples for org.apache.hadoop.io Text getLength
@Override public int getLength()
From source file:eu.larkc.RDFPig.io.NTriplesReader.java
License:Apache License
@Override public Tuple getNext() throws IOException { while (true) { mProtoTuple = null;//from ww w . ja v a 2s . co m try { boolean notDone = in.nextKeyValue(); if (!notDone) { return null; } Text value = null; try { value = (Text) in.getCurrentValue(); byte[] buf = value.getBytes(); int len = value.getLength(); if (len < 3) continue; // Ignore lines with less than 3 bytes //Get rid of any trailing whitespace while (Character.isWhitespace(buf[len - 1])) len--; if (buf[len - 1] != '.') continue;//throw new ExecException("Could not parse triple, no trailing \'.\': " + value); else len--; //Get rid of any trailing whitespace while (Character.isWhitespace(buf[len - 1])) len--; int start = 0; while (Character.isWhitespace(buf[start])) start++; // Parse subject boolean isURI = buf[0] == '<'; for (int i = 0; i < len; i++) { if (isURI && buf[i] == '>') { readField(buf, start, i + 1); start = i + 1; break; } else if (Character.isWhitespace(buf[i])) { readField(buf, start, i); start = i + 1; break; } } while (Character.isWhitespace(buf[start])) start++; // Parse predicate (always URI) for (int i = start; i < len; i++) { if (buf[i] == '>') { readField(buf, start, i + 1); start = i + 1; break; } } while (Character.isWhitespace(buf[start])) start++; // Parse object if (buf[start] == '<') //URI for (int i = start + 1; i < len; i++) { if (buf[i] == '>') { readField(buf, start, i + 1); start = i + 1; break; } } else if (buf[start] == '"') //Literal for (int i = start + 1; i < len; i++) { if (buf[i] == '"' && i > 0 && buf[i - 1] != '\\') { readField(buf, start, i + 1); start = i + 1; break; } } else if (buf[start] == '_') {//BNode int i = start + 1; for (; i < len; i++) { if (Character.isWhitespace(buf[i])) { readField(buf, start, i); start = i + 1; break; } } // We are at end of line, read it readField(buf, start, i); } else continue;//throw new ExecException("Could not parse triple, invalid term in object position: " + value); // After the first three terms, the rest are ignored 
if (mProtoTuple.size() != 3) continue; Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple); mProtoTuple = null; return t; } catch (Exception e) { e.printStackTrace(); System.err.println("For line: " + value); mProtoTuple = null; } } catch (Exception e) { int errCode = 6018; String errMsg = "Error while reading input"; throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e); } } }
From source file:example.TestLineRecordReader.java
License:Apache License
@Test public void testUncompressedInputCustomDelimiterPosValue() throws Exception { Configuration conf = new Configuration(); conf.setInt("io.file.buffer.size", 10); conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE); String inputData = "abcdefghij++kl++mno"; Path inputFile = createInputFile(conf, inputData); String delimiter = "++"; byte[] recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8); int splitLength = 15; FileSplit split = new FileSplit(inputFile, 0, splitLength, (String[]) null); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); LineRecordReader reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context);// w w w . ja v a 2 s .c om // Get first record: "abcdefghij" assertTrue("Expected record got nothing", reader.nextKeyValue()); LongWritable key = reader.getCurrentKey(); Text value = reader.getCurrentValue(); assertEquals("Wrong length for record value", 10, value.getLength()); assertEquals("Wrong position after record read", 0, key.get()); // Get second record: "kl" assertTrue("Expected record got nothing", reader.nextKeyValue()); assertEquals("Wrong length for record value", 2, value.getLength()); // Key should be 12 right after "abcdefghij++" assertEquals("Wrong position after record read", 12, key.get()); // Get third record: "mno" assertTrue("Expected record got nothing", reader.nextKeyValue()); assertEquals("Wrong length for record value", 3, value.getLength()); // Key should be 16 right after "abcdefghij++kl++" assertEquals("Wrong position after record read", 16, key.get()); assertFalse(reader.nextKeyValue()); // Key should be 19 right after "abcdefghij++kl++mno" assertEquals("Wrong position after record read", 19, key.get()); // after refresh should be empty key = reader.getCurrentKey(); assertNull("Unexpected key returned", key); reader.close(); split = new FileSplit(inputFile, splitLength, inputData.length() - splitLength, 
(String[]) null); reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context); // No record is in the second split because the second split dropped // the first record, which was already reported by the first split. assertFalse("Unexpected record returned", reader.nextKeyValue()); key = reader.getCurrentKey(); assertNull("Unexpected key returned", key); reader.close(); // multi char delimiter with starting part of the delimiter in the data inputData = "abcd+efgh++ijk++mno"; inputFile = createInputFile(conf, inputData); splitLength = 5; split = new FileSplit(inputFile, 0, splitLength, (String[]) null); reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context); // Get first record: "abcd+efgh" assertTrue("Expected record got nothing", reader.nextKeyValue()); key = reader.getCurrentKey(); value = reader.getCurrentValue(); assertEquals("Wrong position after record read", 0, key.get()); assertEquals("Wrong length for record value", 9, value.getLength()); // should have jumped over the delimiter, no record assertFalse(reader.nextKeyValue()); assertEquals("Wrong position after record read", 11, key.get()); // after refresh should be empty key = reader.getCurrentKey(); assertNull("Unexpected key returned", key); reader.close(); // next split: check for duplicate or dropped records split = new FileSplit(inputFile, splitLength, inputData.length() - splitLength, (String[]) null); reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context); assertTrue("Expected record got nothing", reader.nextKeyValue()); key = reader.getCurrentKey(); value = reader.getCurrentValue(); // Get second record: "ijk" first in this split assertEquals("Wrong position after record read", 11, key.get()); assertEquals("Wrong length for record value", 3, value.getLength()); // Get third record: "mno" second in this split assertTrue("Expected record got nothing", reader.nextKeyValue()); assertEquals("Wrong position after 
record read", 16, key.get()); assertEquals("Wrong length for record value", 3, value.getLength()); // should be at the end of the input assertFalse(reader.nextKeyValue()); assertEquals("Wrong position after record read", 19, key.get()); reader.close(); inputData = "abcd|efgh|+|ij|kl|+|mno|pqr"; inputFile = createInputFile(conf, inputData); delimiter = "|+|"; recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8); // walking over the buffer and split sizes checks for proper processing // of the ambiguous bytes of the delimiter for (int bufferSize = 1; bufferSize <= inputData.length(); bufferSize++) { for (int splitSize = 1; splitSize < inputData.length(); splitSize++) { // track where we are in the inputdata int keyPosition = 0; conf.setInt("io.file.buffer.size", bufferSize); split = new FileSplit(inputFile, 0, bufferSize, (String[]) null); reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context); // Get the first record: "abcd|efgh" always possible assertTrue("Expected record got nothing", reader.nextKeyValue()); key = reader.getCurrentKey(); value = reader.getCurrentValue(); assertTrue("abcd|efgh".equals(value.toString())); // Position should be 0 right at the start assertEquals("Wrong position after record read", keyPosition, key.get()); // Position should be 12 right after the first "|+|" keyPosition = 12; // get the next record: "ij|kl" if the split/buffer allows it if (reader.nextKeyValue()) { // check the record info: "ij|kl" assertTrue("ij|kl".equals(value.toString())); assertEquals("Wrong position after record read", keyPosition, key.get()); // Position should be 20 after the second "|+|" keyPosition = 20; } // get the third record: "mno|pqr" if the split/buffer allows it if (reader.nextKeyValue()) { // check the record info: "mno|pqr" assertTrue("mno|pqr".equals(value.toString())); assertEquals("Wrong position after record read", keyPosition, key.get()); // Position should be the end of the input keyPosition = 
inputData.length(); } assertFalse("Unexpected record returned", reader.nextKeyValue()); // no more records can be read we should be at the last position assertEquals("Wrong position after record read", keyPosition, key.get()); // after refresh should be empty key = reader.getCurrentKey(); assertNull("Unexpected key returned", key); reader.close(); } } }
From source file:example.TestLineRecordReader.java
License:Apache License
@Test public void testUncompressedInputDefaultDelimiterPosValue() throws Exception { Configuration conf = new Configuration(); String inputData = "1234567890\r\n12\r\n345"; Path inputFile = createInputFile(conf, inputData); conf.setInt("io.file.buffer.size", 10); conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE); FileSplit split = new FileSplit(inputFile, 0, 15, (String[]) null); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); LineRecordReader reader = new LineRecordReader(null); reader.initialize(split, context);/*from w ww .j av a 2s . c o m*/ LongWritable key; Text value; reader.nextKeyValue(); key = reader.getCurrentKey(); value = reader.getCurrentValue(); // Get first record:"1234567890" assertEquals(10, value.getLength()); assertEquals(0, key.get()); reader.nextKeyValue(); // Get second record:"12" assertEquals(2, value.getLength()); // Key should be 12 right after "1234567890\r\n" assertEquals(12, key.get()); assertFalse(reader.nextKeyValue()); // Key should be 16 right after "1234567890\r\n12\r\n" assertEquals(16, key.get()); split = new FileSplit(inputFile, 15, 4, (String[]) null); reader = new LineRecordReader(null); reader.initialize(split, context); // The second split dropped the first record "\n" reader.nextKeyValue(); key = reader.getCurrentKey(); value = reader.getCurrentValue(); // Get third record:"345" assertEquals(3, value.getLength()); // Key should be 16 right after "1234567890\r\n12\r\n" assertEquals(16, key.get()); assertFalse(reader.nextKeyValue()); // Key should be 19 right after "1234567890\r\n12\r\n345" assertEquals(19, key.get()); inputData = "123456789\r\r\n"; inputFile = createInputFile(conf, inputData); split = new FileSplit(inputFile, 0, 12, (String[]) null); reader = new LineRecordReader(null); reader.initialize(split, context); reader.nextKeyValue(); key = reader.getCurrentKey(); value = reader.getCurrentValue(); // Get first 
record:"123456789" assertEquals(9, value.getLength()); assertEquals(0, key.get()); reader.nextKeyValue(); // Get second record:"" assertEquals(0, value.getLength()); // Key should be 10 right after "123456789\r" assertEquals(10, key.get()); assertFalse(reader.nextKeyValue()); // Key should be 12 right after "123456789\r\r\n" assertEquals(12, key.get()); }
From source file:fi.tkk.ics.hadoop.bam.SequencedFragment.java
License:Open Source License
/**
 * Convert quality scores in-place.
 *
 * <p>All scores are validated before any byte is modified, so when a
 * {@code FormatException} is thrown {@code quality} is left untouched.
 *
 * @param quality quality string to convert; only its first {@code getLength()} bytes are read and rewritten
 * @param current encoding the scores are currently in
 * @param target encoding to convert the scores to
 * @throws FormatException if quality scores are out of the range
 *         allowed by the current encoding.
 * @throws IllegalArgumentException if current and target quality encodings are the same,
 *         or if the requested conversion is not Illumina-to-Sanger or Sanger-to-Illumina.
 */
public static void convertQuality(Text quality, BaseQualityEncoding current, BaseQualityEncoding target) {
    if (current == target) {
        throw new IllegalArgumentException(
                "current and target quality encodinds are the same (" + current + ")");
    }

    byte[] bytes = quality.getBytes();
    final int len = quality.getLength();
    final int illuminaSangerDistance = FormatConstants.ILLUMINA_OFFSET - FormatConstants.SANGER_OFFSET;

    if (current == BaseQualityEncoding.Illumina && target == BaseQualityEncoding.Sanger) {
        // First pass: validate only, so a bad score cannot leave a half-converted buffer
        for (int i = 0; i < len; ++i) {
            if (bytes[i] < FormatConstants.ILLUMINA_OFFSET
                    || bytes[i] > (FormatConstants.ILLUMINA_OFFSET + FormatConstants.ILLUMINA_MAX)) {
                throw new FormatException("base quality score out of range for Illumina Phred+64 format (found "
                        + (bytes[i] - FormatConstants.ILLUMINA_OFFSET) + " but acceptable range is [0,"
                        + FormatConstants.ILLUMINA_MAX + "]).\n"
                        + "Maybe qualities are encoded in Sanger format?\n");
            }
        }
        // Second pass: shift every score down to the Sanger offset
        for (int i = 0; i < len; ++i) {
            bytes[i] -= illuminaSangerDistance;
        }
    } else if (current == BaseQualityEncoding.Sanger && target == BaseQualityEncoding.Illumina) {
        for (int i = 0; i < len; ++i) {
            if (bytes[i] < FormatConstants.SANGER_OFFSET
                    || bytes[i] > (FormatConstants.SANGER_OFFSET + FormatConstants.SANGER_MAX)) {
                // Sanger scores are offset by 33, not 64
                throw new FormatException("base quality score out of range for Sanger Phred+33 format (found "
                        + (bytes[i] - FormatConstants.SANGER_OFFSET) + " but acceptable range is [0,"
                        + FormatConstants.SANGER_MAX + "]).\n"
                        + "Maybe qualities are encoded in Illumina format?\n");
            }
        }
        // Second pass: shift every score up to the Illumina offset
        for (int i = 0; i < len; ++i) {
            bytes[i] += illuminaSangerDistance;
        }
    } else {
        throw new IllegalArgumentException(
                "unsupported BaseQualityEncoding transformation from " + current + " to " + target);
    }
}
From source file:fi.tkk.ics.hadoop.bam.SequencedFragment.java
License:Open Source License
/** * Verify that the given quality bytes are within the range allowed for the specified encoding. * * In theory, the Sanger encoding uses the entire * range of characters from ASCII 33 to 126, giving a value range of [0,93]. However, values over 60 are * unlikely in practice, and are more likely to be caused by mistaking a file that uses Illumina encoding * for Sanger. So, we'll enforce the same range supported by Illumina encoding ([0,62]) for Sanger. * * @return -1 if quality is ok.// w w w .ja v a 2s . c om * @return If an out-of-range value is found the index of the value is returned. */ public static int verifyQuality(Text quality, BaseQualityEncoding encoding) { // set allowed quality range int max, min; if (encoding == BaseQualityEncoding.Illumina) { max = FormatConstants.ILLUMINA_OFFSET + FormatConstants.ILLUMINA_MAX; min = FormatConstants.ILLUMINA_OFFSET; } else if (encoding == BaseQualityEncoding.Sanger) { max = FormatConstants.SANGER_OFFSET + FormatConstants.SANGER_MAX; min = FormatConstants.SANGER_OFFSET; } else throw new IllegalArgumentException("Unsupported base encoding quality " + encoding); // verify final byte[] bytes = quality.getBytes(); final int len = quality.getLength(); for (int i = 0; i < len; ++i) { if (bytes[i] < min || bytes[i] > max) return i; } return -1; }
From source file:fm.last.darling.hbase.HBaseJSONOutputReader.java
License:Apache License
/**
 * Returns a copy of the valid bytes of {@code text} without the first and the last byte.
 *
 * @param text value to trim; must hold at least two bytes
 * @return a new array of {@code text.getLength() - 2} bytes
 * @throws IllegalArgumentException if {@code text} holds fewer than two bytes
 */
private byte[] trimOuterBytes(Text text) {
    final int length = text.getLength();
    // Fail with a descriptive error instead of a NegativeArraySizeException
    if (length < 2) {
        throw new IllegalArgumentException(
                "Cannot trim outer bytes of a value with fewer than 2 bytes (length=" + length + ")");
    }
    byte[] bytes = new byte[length - 2];
    System.arraycopy(text.getBytes(), 1, bytes, 0, bytes.length);
    return bytes;
}
From source file:fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.ExpressionRecordWriter.java
License:LGPL
@Override public synchronized void write(final Text key, final LongWritable value) throws IOException, InterruptedException { this.context.getCounter(COUNTERS_GROUP, INPUT_ENTRIES).increment(1); if (value == null) { return;//www .j a v a 2 s. c o m } this.out.write(key.getBytes(), 0, key.getLength()); this.out.write(separator); this.out.write(value.toString().getBytes(StandardCharsets.UTF_8)); this.out.write(newline); this.context.getCounter(COUNTERS_GROUP, ENTRIES_WRITTEN).increment(1); }
From source file:fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.SAMRecordWriter.java
License:LGPL
@Override public synchronized void write(final Text key, final Text value) throws IOException, InterruptedException { this.context.getCounter(COUNTERS_GROUP, INPUT_ENTRIES).increment(1); if (value == null) { return;//from w w w . j a v a 2 s. c om } this.out.write(value.getBytes(), 0, value.getLength()); this.out.write(newline); this.context.getCounter(COUNTERS_GROUP, ENTRIES_WRITTEN).increment(1); }
From source file:gov.jgi.meta.hadoop.input.FastaBlockLineReader.java
License:Open Source License
public int readLine(Text key, Map<String, String> set, int maxLineLength, long maxBytesToConsume) throws IOException { int totalBytesRead = 0; int numRecordsRead = 0; Boolean eof = false;/*from w w w. j a v a2s . c o m*/ int startPosn; Text recordBlock = new Text(); /* first thing to do is to move forward till you see a start character */ startPosn = bufferPosn; do { if (bufferPosn >= bufferLength) { totalBytesRead += bufferPosn - startPosn; bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) { eof = true; break; // EOF } } } while (buffer[bufferPosn++] != '>'); /* if we hit the end of file already, then just return 0 bytes processed */ if (eof) return totalBytesRead; /* now bufferPosn should be at the start of a fasta record */ totalBytesRead += (bufferPosn - 1) - startPosn; startPosn = bufferPosn - 1; // startPosn guaranteed to be at a ">" /* find the next record start */ eof = false; do { if (bufferPosn >= bufferLength) { /* copy the current buffer before refreshing the buffer */ int appendLength = bufferPosn - startPosn; recordBlock.append(buffer, startPosn, appendLength); totalBytesRead += appendLength; startPosn = bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) { eof = true; break; // EOF } } } while (buffer[bufferPosn++] != '>' || (totalBytesRead + bufferPosn - startPosn) <= maxBytesToConsume); if (!eof) { bufferPosn--; // make sure we leave bufferPosn pointing to the next record int appendLength = bufferPosn - startPosn; recordBlock.append(buffer, startPosn, appendLength); totalBytesRead += appendLength; } /* record block now has the byte array we want to process for reads */ Text k = new Text(); Text s = new Text(); int i = 1; // skip initial record seperator ">" int j = 1; do { k.clear(); s.clear(); /* first parse the key */ i = j; Boolean junkOnLine = false; while (j < recordBlock.getLength()) { int c = recordBlock.charAt(j++); if (c == CR || c == LF) { break; } else if (c == ' ' || c == '\t') { junkOnLine = 
true; break; } } k.append(recordBlock.getBytes(), i, j - i - 1); /* in case there is additional metadata on the header line, ignore everything after the first word. */ if (junkOnLine) { while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF) j++; } //LOG.info ("key = " + k.toString()); /* now skip the newlines */ while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) j++; /* now read the sequence */ do { i = j; while (j < recordBlock.getLength()) { int c = recordBlock.charAt(j++); if (c == CR || c == LF) { break; } } s.append(recordBlock.getBytes(), i, j - i - 1); set.put(k.toString(), s.toString().toLowerCase()); while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) j++; } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '>'); numRecordsRead++; /* now skip characters (newline or carige return most likely) till record start */ while (j < recordBlock.getLength() && recordBlock.charAt(j) != '>') { j++; } j++; // skip the ">" } while (j < recordBlock.getLength()); return totalBytesRead; }
From source file:gov.jgi.meta.hadoop.input.FastqBlockLineReader.java
License:Open Source License
public int readLine(Text key, Map<String, String> set, int maxLineLength, int maxBytesToConsume) throws IOException { int totalBytesRead = 0; int numRecordsRead = 0; Boolean eof = false;/*from w ww.java 2s . co m*/ int startPosn; Text recordBlock = new Text(); /* first thing to do is to move forward till you see a start character */ startPosn = bufferPosn; do { if (bufferPosn >= bufferLength) { totalBytesRead += bufferPosn - startPosn; bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) { eof = true; break; // EOF } } } while (buffer[bufferPosn++] != '@'); /* if we hit the end of file already, then just return 0 bytes processed */ if (eof) return totalBytesRead; /* now bufferPosn should be at the start of a fasta record */ totalBytesRead += (bufferPosn - 1) - startPosn; startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@" /* find the next record start */ eof = false; do { if (bufferPosn >= bufferLength) { /* copy the current buffer before refreshing the buffer */ int appendLength = bufferPosn - startPosn; recordBlock.append(buffer, startPosn, appendLength); totalBytesRead += appendLength; startPosn = bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) { eof = true; break; // EOF } } } while (buffer[bufferPosn++] != '@' || (totalBytesRead + bufferPosn - startPosn) < maxBytesToConsume); if (!eof) { bufferPosn--; // make sure we leave bufferPosn pointing to the next record int appendLength = bufferPosn - startPosn; recordBlock.append(buffer, startPosn, appendLength); totalBytesRead += appendLength; } /* record block now has the byte array we want to process for reads */ Text k = new Text(); Text s = new Text(); int i = 1; // skip initial record seperator ">" int j = 1; do { k.clear(); s.clear(); /* first parse the key */ i = j; Boolean junkOnLine = false; while (j < recordBlock.getLength()) { int c = recordBlock.charAt(j++); if (c == CR || c == LF) { break; } else if (c == ' ' || c == '\t') { junkOnLine = true; 
break; } } k.append(recordBlock.getBytes(), i, j - i - 1); /* in case there is additional metadata on the header line, ignore everything after the first word. */ if (junkOnLine) { while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF) j++; } //LOG.info ("key = " + k.toString()); /* now skip the newlines */ while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) j++; /* now read the sequence */ do { i = j; while (j < recordBlock.getLength()) { int c = recordBlock.charAt(j++); if (c == CR || c == LF) { break; } } s.append(recordBlock.getBytes(), i, j - i - 1); set.put(k.toString(), s.toString().toLowerCase()); while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) j++; } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+'); numRecordsRead++; /* now skip characters (newline or carige return most likely) till record start */ while (j < recordBlock.getLength() && recordBlock.charAt(j) != '@') { j++; } j++; // skip the "@" } while (j < recordBlock.getLength()); return totalBytesRead; }