List of usage examples for org.apache.hadoop.io Text getLength
@Override public int getLength()
From source file:gov.jgi.meta.hadoop.input.FastqLineReader.java
License:Open Source License
/** * Read one line from the InputStream into the given Text. A line * can be terminated by one of the following: '\n' (LF) , '\r' (CR), * or '\r\n' (CR+LF). EOF also terminates an otherwise unterminated * line./* w w w. j a va2 s . c o m*/ * * @param str the object to store the given line (without newline) * @param maxLineLength the maximum number of bytes to store into str; * the rest of the line is silently discarded. * @param maxBytesToConsume the maximum number of bytes to consume * in this call. This is only a hint, because if the line cross * this threshold, we allow it to happen. It can overshoot * potentially by as much as one buffer length. * @return the number of bytes read including the (longest) newline * found. * @throws java.io.IOException if the underlying stream throws */ public int readLine(Text key, Text str, int maxLineLength, int maxBytesToConsume) throws IOException { int totalBytesRead = 0; int numRecordsRead = 0; Boolean eof = false; int startPosn; Text recordBlock = new Text(); /* first thing to do is to move forward till you see a start character */ startPosn = bufferPosn; do { if (bufferPosn >= bufferLength) { totalBytesRead += bufferPosn - startPosn; bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) { eof = true; break; // EOF } } } while (buffer[bufferPosn++] != '@'); /* if we hit the end of file already, then just return 0 bytes processed */ if (eof) return totalBytesRead; /* now bufferPosn should be at the start of a fastq record */ totalBytesRead += (bufferPosn - 1) - startPosn; startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@" /* find the next record start */ eof = false; int numOfNewlines = 0;//Added by lanhin do { if (bufferPosn >= bufferLength) { /* copy the current buffer before refreshing the buffer */ int appendLength = bufferPosn - startPosn; recordBlock.append(buffer, startPosn, appendLength); totalBytesRead += appendLength; startPosn = bufferPosn = 0; bufferLength = in.read(buffer); if 
(bufferLength <= 0) { eof = true; break; // EOF } } //Modefied by lanhin if (buffer[bufferPosn] == CR || buffer[bufferPosn] == LF) { numOfNewlines++; } if ((numOfNewlines >= 4) && buffer[bufferPosn] == '@') { bufferPosn++; break; } bufferPosn++; } while (true);//buffer[bufferPosn++] != '@'); // only read one record at a time //Modefied by lanhin end if (!eof) { bufferPosn--; // make sure we leave bufferPosn pointing to the next record int appendLength = bufferPosn - startPosn; recordBlock.append(buffer, startPosn, appendLength); totalBytesRead += appendLength; } /* record block now has the byte array we want to process for reads */ int i = 1; // skip initial record seperator "@" int j = 1; do { key.clear(); str.clear(); /* first parse the key */ i = j; Boolean junkOnLine = false; while (j < recordBlock.getLength()) { int c = recordBlock.charAt(j++); if (c == CR || c == LF) { break; } else if (c == ' ' || c == '\t') { junkOnLine = true; break; } } key.append(recordBlock.getBytes(), i, j - i - 1); /* in case there is additional metadata on the header line, ignore everything after the first word. 
*/ if (junkOnLine) { while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF) j++; } //LOG.info ("key = " + k.toString()); /* now skip the newlines */ while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) j++; /* now read the sequence */ do { i = j; while (j < recordBlock.getLength()) { int c = recordBlock.charAt(j++); if (c == CR || c == LF) { break; } } str.append(recordBlock.getBytes(), i, j - i - 1); while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) j++; } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+'); numRecordsRead++; /* now skip characters (newline or carige return most likely) till record start */ while (j < recordBlock.getLength()) { // && recordBlock.charAt(j) != '@') { // Modified by lanhin /* Should go straight to the end of recordBlock, ignore all the left info. --lanhin*/ j++; } j++; // skip the "@" } while (j < recordBlock.getLength()); return totalBytesRead; }
From source file:gov.jgi.meta.pig.storage.FastaStorage.java
License:Open Source License
/** * returns the next sequence from the block *//*ww w. j a va 2 s. c o m*/ @Override public Tuple getNext() throws IOException { if (mProtoTuple == null) { mProtoTuple = new ArrayList<Object>(); } try { boolean notDone = in.nextKeyValue(); if (!notDone) { return (null); } /* check the id of the sequence to see if its a paired read */ String seqid = (in.getCurrentKey()).toString(); String seqkey = null; String seqkey2; String header = ""; String direction; for (int i = 0; i < seqid.length(); i++) { if (seqid.charAt(i) == ' ' || seqid.charAt(i) == '\t') { seqkey = seqid.substring(0, i); header = seqid.substring(i, seqid.length()); break; } } if (seqkey == null) seqkey = seqid; if (seqkey.indexOf("/") >= 0) { String[] a = seqkey.split("/"); seqkey2 = a[0]; direction = a[1]; } else { seqkey2 = seqkey; direction = "0"; } Text value = ((Text) in.getCurrentValue()); mProtoTuple.add(new DataByteArray(seqkey2.getBytes(), 0, seqkey2.length())); // add key mProtoTuple.add(new DataByteArray(direction.getBytes(), 0, direction.length())); // add direction mProtoTuple.add(new DataByteArray(value.getBytes(), 0, value.getLength())); // add sequence mProtoTuple.add(new DataByteArray(header.getBytes(), 0, header.length())); // add header Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple); mProtoTuple = null; return (t); } catch (InterruptedException e) { int errCode = 6018; String errMsg = "Error while reading input"; throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e); } }
From source file:gov.jgi.meta.sequence.SequenceString.java
License:Open Source License
public static String byteArrayToSequence(Text seq) { init();//from w ww .j a va 2 s . c o m StringBuffer sb = new StringBuffer(); byte[] ba = seq.getBytes(); for (int i = 0; i < seq.getLength(); i++) { sb.append(reverseHash.get(ba[i])); } return sb.toString(); }
From source file:hivemall.fm.FFMPredictUDF.java
License:Apache License
@Override public Object evaluate(DeferredObject[] args) throws HiveException { String modelId = _modelIdOI.getPrimitiveJavaObject(args[0].get()); if (modelId == null) { throw new HiveException("modelId is not set"); }/*w w w . j a v a2 s. c o m*/ final FFMPredictionModel model; if (modelId.equals(_cachedModeId)) { model = this._cachedModel; } else { Text serModel = _modelOI.getPrimitiveWritableObject(args[1].get()); if (serModel == null) { throw new HiveException("Model is null for model ID: " + modelId); } byte[] b = serModel.getBytes(); final int length = serModel.getLength(); try { model = FFMPredictionModel.deserialize(b, length); b = null; } catch (ClassNotFoundException e) { throw new HiveException(e); } catch (IOException e) { throw new HiveException(e); } this._cachedModeId = modelId; this._cachedModel = model; } int numFeatures = model.getNumFeatures(); int numFields = model.getNumFields(); Object arg2 = args[2].get(); // [workaround] // java.lang.ClassCastException: org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryArray // cannot be cast to [Ljava.lang.Object; if (arg2 instanceof LazyBinaryArray) { arg2 = ((LazyBinaryArray) arg2).getList(); } Feature[] x = Feature.parseFFMFeatures(arg2, _featureListOI, _probes, numFeatures, numFields); if (x == null || x.length == 0) { return null; // return NULL if there are no features } this._probes = x; double predicted = predict(x, model); _result.set(predicted); return _result; }
From source file:hivemall.mix.MixMessageEncoder.java
License:Open Source License
private static void encodeObject(final Object obj, final ByteBuf buf) throws IOException { assert (obj != null); if (obj instanceof Integer) { Integer i = (Integer) obj; buf.writeByte(INTEGER_TYPE);//from ww w . j a v a2 s . co m buf.writeInt(i.intValue()); } else if (obj instanceof Text) { Text t = (Text) obj; byte[] b = t.getBytes(); int length = t.getLength(); buf.writeByte(TEXT_TYPE); buf.writeInt(length); buf.writeBytes(b, 0, length); } else if (obj instanceof String) { String s = (String) obj; buf.writeByte(STRING_TYPE); writeString(s, buf); } else if (obj instanceof IntWritable) { IntWritable i = (IntWritable) obj; buf.writeByte(INT_WRITABLE_TYPE); buf.writeInt(i.get()); } else if (obj instanceof LongWritable) { LongWritable l = (LongWritable) obj; buf.writeByte(LONG_WRITABLE_TYPE); buf.writeLong(l.get()); } else { throw new IllegalStateException("Unexpected type: " + obj.getClass().getName()); } }
From source file:hivemall.sketch.bloom.BloomFilterUtils.java
License:Apache License
/**
 * Deserializes a filter from the content of a {@code Text} by delegating to the
 * byte-array overload with offset 0 and the text's valid length.
 *
 * @param in the serialized filter
 * @param dst the filter instance passed through to the byte-array overload
 * @return whatever the byte-array overload returns
 * @throws IOException if the byte-array overload fails
 */
@Nonnull
public static <F extends Filter> F deserialize(@Nonnull final Text in, @Nonnull final F dst)
        throws IOException {
    final byte[] backing = in.getBytes();
    final int validLength = in.getLength();
    return deserialize(backing, 0, validLength, dst);
}
From source file:hivemall.tools.compress.DeflateUDF.java
License:Apache License
/**
 * Compresses the string argument with the deflate codec.
 *
 * <p>The codec is created on first use and the output writable is reused across calls.
 *
 * @param arguments {@code arguments[0]} holds the text to compress
 * @return the compressed bytes, or {@code null} when the argument is null
 * @throws HiveException if compression fails
 */
@Override
public BytesWritable evaluate(DeferredObject[] arguments) throws HiveException {
    if (codec == null) {
        this.codec = new DeflateCodec(true, false);
    }

    final Object arg0 = arguments[0].get();
    if (arg0 == null) {
        return null;
    }

    final Text text = stringOI.getPrimitiveWritableObject(arg0);
    final byte[] compressed;
    try {
        // Only the first getLength() bytes of the Text's backing array are compressed.
        compressed = codec.compress(text.getBytes(), 0, text.getLength(), compressionLevel);
    } catch (IOException e) {
        throw new HiveException("Failed to compress", e);
    }

    if (result != null) {
        result.set(compressed, 0, compressed.length);
        return result;
    }
    this.result = new BytesWritable(compressed);
    return result;
}
From source file:hivemall.tools.text.Unbase91UDF.java
License:Apache License
@Override public BytesWritable evaluate(DeferredObject[] arguments) throws HiveException { if (outputBuf == null) { this.outputBuf = new FastByteArrayOutputStream(4096); } else {// w w w .ja v a 2s . c om outputBuf.reset(); } Object arg0 = arguments[0].get(); if (arg0 == null) { return null; } Text input = stringOI.getPrimitiveWritableObject(arg0); final byte[] inputBytes = input.getBytes(); final int len = input.getLength(); try { Base91.decode(inputBytes, 0, len, outputBuf); } catch (IOException e) { throw new HiveException(e); } if (result == null) { byte[] outputBytes = outputBuf.toByteArray(); this.result = new BytesWritable(outputBytes); } else { byte[] outputBytes = outputBuf.getInternalArray(); int outputSize = outputBuf.size(); result.set(outputBytes, 0, outputSize); } return result; }
From source file:io.aos.hdfs.StringTextComparisonTest.java
License:Apache License
/**
 * Checks that {@code Text} measures length and positions in UTF-8 bytes: the four
 * characters of the sample occupy 1, 2, 3 and 4 bytes, so they start at byte offsets
 * 0, 1, 3 and 6 and the total length is 10.
 */
@Test
public void text() {
    final Text sample = new Text("\u0041\u00DF\u6771\uD801\uDC00");

    assertThat(sample.getLength(), is(10));

    // U+0041 at byte offset 0
    assertThat(sample.find("\u0041"), is(0));
    assertThat(sample.charAt(0), is(0x0041));

    // U+00DF at byte offset 1
    assertThat(sample.find("\u00DF"), is(1));
    assertThat(sample.charAt(1), is(0x00DF));

    // U+6771 at byte offset 3
    assertThat(sample.find("\u6771"), is(3));
    assertThat(sample.charAt(3), is(0x6771));

    // U+10400 (a surrogate pair in the Java string) at byte offset 6
    assertThat(sample.find("\uD801\uDC00"), is(6));
    assertThat(sample.charAt(6), is(0x10400));
}
From source file:io.aos.hdfs.TextIterator.java
License:Apache License
/**
 * Prints, in hexadecimal and one per line, each code point of a sample {@code Text}
 * by walking its UTF-8 bytes.
 *
 * @param args ignored
 */
public static void main(String... args) {
    final Text sample = new Text("\u0041\u00DF\u6771\uD801\uDC00");
    // Wrap only the valid portion of the backing array.
    final ByteBuffer cursor = ByteBuffer.wrap(sample.getBytes(), 0, sample.getLength());

    while (cursor.hasRemaining()) {
        final int codePoint = Text.bytesToCodePoint(cursor);
        if (codePoint == -1) {
            break;
        }
        System.out.println(Integer.toHexString(codePoint));
    }
}