Example usage for org.apache.hadoop.io Text getLength

List of usage examples for org.apache.hadoop.io Text getLength

Introduction

On this page you can find example usages of org.apache.hadoop.io.Text.getLength().

Prototype

@Override
public int getLength() 

Source Link

Document

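Returns the number of valid bytes in the backing byte array, i.e. the length of the UTF-8 encoded text. This can be smaller than the length of the array returned by getBytes().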

Usage

From source file:gov.jgi.meta.hadoop.input.FastqLineReader.java

License:Open Source License

/**
 * Read one line from the InputStream into the given Text.  A line
 * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
 * or '\r\n' (CR+LF).  EOF also terminates an otherwise unterminated
 * line./*  w w  w.  j  a  va2  s . c o  m*/
 *
 * @param str               the object to store the given line (without newline)
 * @param maxLineLength     the maximum number of bytes to store into str;
 *                          the rest of the line is silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume
 *                          in this call.  This is only a hint, because if the line cross
 *                          this threshold, we allow it to happen.  It can overshoot
 *                          potentially by as much as one buffer length.
 * @return the number of bytes read including the (longest) newline
 *         found.
 * @throws java.io.IOException if the underlying stream throws
 */
public int readLine(Text key, Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    int totalBytesRead = 0;
    int numRecordsRead = 0;
    Boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '@');

    /*
    if we hit the end of file already, then just return 0 bytes processed
     */
    if (eof)
        return totalBytesRead;

    /*
    now bufferPosn should be at the start of a fastq record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@"

    /*
    find the next record start
     */
    eof = false;
    int numOfNewlines = 0;//Added by lanhin
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
        //Modefied by lanhin
        if (buffer[bufferPosn] == CR || buffer[bufferPosn] == LF) {
            numOfNewlines++;
        }
        if ((numOfNewlines >= 4) && buffer[bufferPosn] == '@') {
            bufferPosn++;
            break;
        }
        bufferPosn++;
    } while (true);//buffer[bufferPosn++] != '@');  // only read one record at a time
    //Modefied by lanhin end

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads
     */

    int i = 1; // skip initial record seperator "@"
    int j = 1;
    do {
        key.clear();
        str.clear();
        /*
        first parse the key
         */
        i = j;
        Boolean junkOnLine = false;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                break;
            } else if (c == ' ' || c == '\t') {
                junkOnLine = true;
                break;
            }
        }
        key.append(recordBlock.getBytes(), i, j - i - 1);

        /*
        in case there is additional metadata on the header line, ignore everything after
        the first word.
         */
        if (junkOnLine) {
            while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }

        //LOG.info ("key = " + k.toString());

        /*
        now skip the newlines
        */
        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;

        /*
        now read the sequence
        */
        do {
            i = j;
            while (j < recordBlock.getLength()) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    break;
                }
            }
            str.append(recordBlock.getBytes(), i, j - i - 1);

            while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;

        } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+');

        numRecordsRead++;

        /*
        now skip characters (newline or carige return most likely) till record start
        */
        while (j < recordBlock.getLength()) {
            // && recordBlock.charAt(j) != '@') {  // Modified by lanhin
            /* Should go straight to the end of recordBlock,
               ignore all the left info.  --lanhin*/

            j++;
        }

        j++; // skip the "@"

    } while (j < recordBlock.getLength());

    return totalBytesRead;
}

From source file:gov.jgi.meta.pig.storage.FastaStorage.java

License:Open Source License

/**
 * Returns the next sequence from the block as a four-field tuple.
 *
 * <p>The fields are, in order: the sequence key (the id up to the first space or tab, with
 * anything from the first '/' onwards removed), the read direction (the text between the
 * first and second '/', or {@code "0"} when there is none), the sequence bytes, and the
 * remainder of the id line starting at the first space or tab (empty when there is none).
 *
 * @return the next tuple, or {@code null} when the reader has no more records
 * @throws IOException if reading fails; an interrupted read is reported as an
 *                     {@code ExecException}
 */
@Override
public Tuple getNext() throws IOException {

    if (mProtoTuple == null) {
        mProtoTuple = new ArrayList<Object>();
    }

    try {
        if (!in.nextKeyValue()) {
            return null;
        }

        /*
          split the id line at the first space or tab into key and header
         */
        final String seqid = (in.getCurrentKey()).toString();
        int headerStart = -1;
        for (int i = 0; i < seqid.length(); i++) {
            final char c = seqid.charAt(i);
            if (c == ' ' || c == '\t') {
                headerStart = i;
                break;
            }
        }
        final String seqkey = (headerStart < 0) ? seqid : seqid.substring(0, headerStart);
        final String header = (headerStart < 0) ? "" : seqid.substring(headerStart);

        /*
          check the id of the sequence to see if its a paired read
         */
        String seqkey2 = seqkey;
        String direction = "0";
        if (seqkey.indexOf('/') >= 0) {
            // split() drops trailing empty strings, so ids such as "abc/" or "/" yield fewer
            // than two parts; fall back instead of indexing past the end of the array.
            final String[] parts = seqkey.split("/");
            seqkey2 = (parts.length > 0) ? parts[0] : "";
            if (parts.length > 1) {
                direction = parts[1];
            }
        }

        final Text value = ((Text) in.getCurrentValue());

        // Encode explicitly and take the length from the encoded bytes: String.length()
        // counts chars, which differs from the byte count for non-ASCII ids.
        final byte[] keyBytes = seqkey2.getBytes("UTF-8");
        final byte[] directionBytes = direction.getBytes("UTF-8");
        final byte[] headerBytes = header.getBytes("UTF-8");

        mProtoTuple.add(new DataByteArray(keyBytes, 0, keyBytes.length)); // add key
        mProtoTuple.add(new DataByteArray(directionBytes, 0, directionBytes.length)); // add direction
        mProtoTuple.add(new DataByteArray(value.getBytes(), 0, value.getLength())); // add sequence
        mProtoTuple.add(new DataByteArray(headerBytes, 0, headerBytes.length)); // add header

        final Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
        // the tuple now owns the list, so a fresh one is allocated on the next call
        mProtoTuple = null;
        return t;
    } catch (InterruptedException e) {
        // restore the interrupt flag so callers further up can still observe it
        Thread.currentThread().interrupt();
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }
}

From source file:gov.jgi.meta.sequence.SequenceString.java

License:Open Source License

/**
 * Converts the bytes held by {@code seq} into a sequence string.
 *
 * <p>Each of the first {@code seq.getLength()} bytes is looked up in {@code reverseHash}
 * and the results are concatenated in order.  Only the valid length is used because the
 * array returned by {@code getBytes()} may be longer than the content.
 *
 * @param seq the encoded sequence
 * @return the decoded sequence string
 */
public static String byteArrayToSequence(Text seq) {
    init();

    final byte[] encoded = seq.getBytes();
    final int length = seq.getLength();

    // method-local accumulator: no need for StringBuffer's synchronization
    final StringBuilder sb = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
        sb.append(reverseHash.get(encoded[i]));
    }

    return sb.toString();
}

From source file:hivemall.fm.FFMPredictUDF.java

License:Apache License

/**
 * Predicts a value for the given features with the FFM model identified by the first
 * argument.
 *
 * <p>The deserialized model is cached together with its id, so the serialized model in
 * the second argument is only read when the id differs from the cached one.
 *
 * @param args model id, serialized model, feature list
 * @return the shared result holder set to the prediction, or {@code null} when there are
 *         no features
 * @throws HiveException if the model id or the model is missing, or deserialization fails
 */
@Override
public Object evaluate(DeferredObject[] args) throws HiveException {
    final String modelId = _modelIdOI.getPrimitiveJavaObject(args[0].get());
    if (modelId == null) {
        throw new HiveException("modelId is not set");
    }

    // start from the cached model and replace it only on a cache miss
    FFMPredictionModel model = this._cachedModel;
    if (!modelId.equals(_cachedModeId)) {
        final Text serialized = _modelOI.getPrimitiveWritableObject(args[1].get());
        if (serialized == null) {
            throw new HiveException("Model is null for model ID: " + modelId);
        }
        try {
            // getLength() bounds the valid part of the backing array
            model = FFMPredictionModel.deserialize(serialized.getBytes(), serialized.getLength());
        } catch (ClassNotFoundException e) {
            throw new HiveException(e);
        } catch (IOException e) {
            throw new HiveException(e);
        }
        this._cachedModeId = modelId;
        this._cachedModel = model;
    }

    final int featureCount = model.getNumFeatures();
    final int fieldCount = model.getNumFields();

    // [workaround]
    // java.lang.ClassCastException: org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryArray
    // cannot be cast to [Ljava.lang.Object;
    Object rawFeatures = args[2].get();
    if (rawFeatures instanceof LazyBinaryArray) {
        rawFeatures = ((LazyBinaryArray) rawFeatures).getList();
    }

    final Feature[] features = Feature.parseFFMFeatures(rawFeatures, _featureListOI, _probes,
        featureCount, fieldCount);
    if (features == null || features.length == 0) {
        return null; // return NULL if there are no features
    }
    // keep the parsed array so it is handed back to the parser on the next call
    this._probes = features;

    final double score = predict(features, model);
    _result.set(score);
    return _result;
}

From source file:hivemall.mix.MixMessageEncoder.java

License:Open Source License

/**
 * Writes {@code obj} to {@code buf} as a one-byte type tag followed by its payload.
 *
 * <p>Supported types are {@code Integer}, {@code Text}, {@code String},
 * {@code IntWritable} and {@code LongWritable}.
 *
 * @param obj the value to encode; must not be null
 * @param buf the destination buffer
 * @throws IOException declared for the string encoding path
 * @throws IllegalStateException if {@code obj} is of an unsupported type
 */
private static void encodeObject(final Object obj, final ByteBuf buf) throws IOException {
    assert (obj != null);

    if (obj instanceof Integer) {
        final Integer boxed = (Integer) obj;
        buf.writeByte(INTEGER_TYPE);
        buf.writeInt(boxed.intValue());
        return;
    }

    if (obj instanceof Text) {
        final Text text = (Text) obj;
        final byte[] bytes = text.getBytes();
        // only the first getLength() bytes of the backing array are written
        final int size = text.getLength();
        buf.writeByte(TEXT_TYPE);
        buf.writeInt(size);
        buf.writeBytes(bytes, 0, size);
        return;
    }

    if (obj instanceof String) {
        final String string = (String) obj;
        buf.writeByte(STRING_TYPE);
        writeString(string, buf);
        return;
    }

    if (obj instanceof IntWritable) {
        final IntWritable writable = (IntWritable) obj;
        buf.writeByte(INT_WRITABLE_TYPE);
        buf.writeInt(writable.get());
        return;
    }

    if (obj instanceof LongWritable) {
        final LongWritable writable = (LongWritable) obj;
        buf.writeByte(LONG_WRITABLE_TYPE);
        buf.writeLong(writable.get());
        return;
    }

    throw new IllegalStateException("Unexpected type: " + obj.getClass().getName());
}

From source file:hivemall.sketch.bloom.BloomFilterUtils.java

License:Apache License

/**
 * Deserializes a filter from the valid bytes of {@code in} into {@code dst}.
 *
 * @param in  the serialized filter; only its first {@code getLength()} bytes are read
 * @param dst the filter instance to populate
 * @return the result of the byte-array overload
 * @throws IOException if deserialization fails
 */
@Nonnull
public static <F extends Filter> F deserialize(@Nonnull final Text in, @Nonnull final F dst)
        throws IOException {
    final byte[] serialized = in.getBytes();
    final int serializedLength = in.getLength();
    return deserialize(serialized, 0, serializedLength, dst);
}

From source file:hivemall.tools.compress.DeflateUDF.java

License:Apache License

/**
 * Compresses the text argument and returns the compressed bytes.
 *
 * <p>The codec and the result holder are created on first use and reused afterwards.
 *
 * @param arguments the text to compress as the first element
 * @return the shared result holder with the compressed bytes, or {@code null} for a
 *         null argument
 * @throws HiveException if compression fails
 */
@Override
public BytesWritable evaluate(DeferredObject[] arguments) throws HiveException {
    if (codec == null) {
        this.codec = new DeflateCodec(true, false);
    }

    final Object arg0 = arguments[0].get();
    if (arg0 == null) {
        return null;
    }

    final Text input = stringOI.getPrimitiveWritableObject(arg0);
    final byte[] inputBytes = input.getBytes();
    // getLength() bounds the valid part of the backing array
    final int inputLength = input.getLength();

    final byte[] deflated;
    try {
        deflated = codec.compress(inputBytes, 0, inputLength, compressionLevel);
    } catch (IOException e) {
        throw new HiveException("Failed to compress", e);
    }

    if (result != null) {
        result.set(deflated, 0, deflated.length);
    } else {
        this.result = new BytesWritable(deflated);
    }
    return result;
}

From source file:hivemall.tools.text.Unbase91UDF.java

License:Apache License

/**
 * Decodes the Base91 text argument and returns the decoded bytes.
 *
 * <p>The output buffer and the result holder are created on first use and reused afterwards.
 *
 * @param arguments the Base91 text as the first element
 * @return the shared result holder with the decoded bytes, or {@code null} for a null
 *         argument
 * @throws HiveException if decoding fails
 */
@Override
public BytesWritable evaluate(DeferredObject[] arguments) throws HiveException {
    if (outputBuf != null) {
        outputBuf.reset();
    } else {
        this.outputBuf = new FastByteArrayOutputStream(4096);
    }

    final Object arg0 = arguments[0].get();
    if (arg0 == null) {
        return null;
    }

    final Text encoded = stringOI.getPrimitiveWritableObject(arg0);
    final byte[] encodedBytes = encoded.getBytes();
    // getLength() bounds the valid part of the backing array
    final int encodedLength = encoded.getLength();
    try {
        Base91.decode(encodedBytes, 0, encodedLength, outputBuf);
    } catch (IOException e) {
        throw new HiveException(e);
    }

    if (result != null) {
        // the holder is given the buffer's internal array together with the valid size
        final byte[] decoded = outputBuf.getInternalArray();
        final int decodedSize = outputBuf.size();
        result.set(decoded, 0, decodedSize);
    } else {
        final byte[] decoded = outputBuf.toByteArray();
        this.result = new BytesWritable(decoded);
    }
    return result;
}

From source file:io.aos.hdfs.StringTextComparisonTest.java

License:Apache License

/**
 * Checks that {@code Text} lengths and positions are byte offsets into the encoded form:
 * the sample holds four characters occupying 1, 2, 3 and 4 bytes respectively.
 */
@Test
public void text() {
    final Text sample = new Text("\u0041\u00DF\u6771\uD801\uDC00");

    // 1 + 2 + 3 + 4 encoded bytes
    assertThat(sample.getLength(), is(10));

    // U+0041 starts at byte 0
    assertThat(sample.find("\u0041"), is(0));
    assertThat(sample.charAt(0), is(0x0041));

    // U+00DF starts at byte 1
    assertThat(sample.find("\u00DF"), is(1));
    assertThat(sample.charAt(1), is(0x00DF));

    // U+6771 starts at byte 3
    assertThat(sample.find("\u6771"), is(3));
    assertThat(sample.charAt(3), is(0x6771));

    // the surrogate pair is a single code point, U+10400, starting at byte 6
    assertThat(sample.find("\uD801\uDC00"), is(6));
    assertThat(sample.charAt(6), is(0x10400));
}

From source file:io.aos.hdfs.TextIterator.java

License:Apache License

/**
 * Prints every code point of a sample {@code Text} in hexadecimal, one per line.
 *
 * @param args unused
 */
public static void main(String... args) {
    final Text sample = new Text("\u0041\u00DF\u6771\uD801\uDC00");

    // wrap only the valid bytes; the backing array may be longer than the content
    final ByteBuffer remaining = ByteBuffer.wrap(sample.getBytes(), 0, sample.getLength());
    while (remaining.hasRemaining()) {
        final int codePoint = Text.bytesToCodePoint(remaining);
        if (codePoint == -1) {
            break;
        }
        System.out.println(Integer.toHexString(codePoint));
    }
}