Example usage for org.apache.hadoop.io Text getLength

Introduction

This page shows example usages of the org.apache.hadoop.io.Text method getLength().

Prototype

@Override
public int getLength() 

Document

Returns the number of bytes in the byte array; that is, the length of the valid UTF-8 encoded data, not the number of characters.
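
For orientation, a minimal sketch of this behavior (class name illustrative; assumes only hadoop-common on the classpath): getLength() counts UTF-8 bytes, which can differ from the Java character count.

import org.apache.hadoop.io.Text;

public class TextLengthDemo {
    public static void main(String[] args) {
        Text t = new Text("héllo");                // 'é' encodes to two bytes in UTF-8
        System.out.println(t.getLength());         // 6 (UTF-8 byte length)
        System.out.println(t.toString().length()); // 5 (character count)
    }
}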

Usage

From source file:edu.isi.mavuno.extract.DIRTExtractor.java

License:Apache License

private List<ContextPatternWritable> getContext(ArrayListOfInts leftPath, ArrayListOfInts rightPath,
        List<TratzParsedTokenWritable> tokens, Text[] chunks) { //, int leftContextSize, int rightContextSize) {
    // construct (context, pattern) pairs
    List<ContextPatternWritable> contexts = new ArrayList<ContextPatternWritable>();

    // make sure that the dimensions are feasible
    if (leftPath.size() < 1 || rightPath.size() < 1) {
        return contexts;
    }

    // make sure we don't split the left context's chunk
    Text leftChunk = chunks[leftPath.get(0) - 1];
    for (int i = 1; i <= leftPath.size() - 1; i++) {
        if (chunks[leftPath.get(i) - 1].equals(leftChunk)) {
            return contexts;
        }
    }

    // make sure we don't split the right context's chunk
    Text rightChunk = chunks[rightPath.get(0) - 1];
    for (int i = rightPath.size() - 1; i >= 1; i--) {
        if (chunks[rightPath.get(i) - 1].equals(rightChunk)) {
            return contexts;
        }
    }

    TratzParsedTokenWritable t;
    Text term, posTag, dependType;

    // construct pattern based on the parse tree path
    final Text pattern = new Text();

    // encode left context chunk type
    // TODO: replace this with a more robust way of checking if this is an actual named entity or not
    Text leftNETag = tokens.get(leftPath.get(0) - 1).getNETag();
    Text leftChunkTag = tokens.get(leftPath.get(0) - 1).getChunkTag();
    if (leftNETag.getLength() != 1 || (leftNETag.getLength() > 0 && leftNETag.getBytes()[0] != 'O')) {
        pattern.append(leftNETag.getBytes(), 0, leftNETag.getLength());
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    } else {
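        // the chunk tag presumably uses BIO encoding (e.g., "B-NP"); strip the
        // two-byte prefix when present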
        if (leftChunkTag.getLength() > 2) {
            pattern.append(leftChunkTag.getBytes(), 2, leftChunkTag.getLength() - 2);
        } else {
            pattern.append(leftChunkTag.getBytes(), 0, leftChunkTag.getLength());
        }
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    }

    // left path portion of pattern
    if (!mHeadOnly) {
        for (int i = 0; i <= leftPath.size() - 2; i++) {
            t = tokens.get(leftPath.get(i) - 1);

            term = mUseLemmas ? t.getLemma() : t.getToken();
            dependType = t.getDependType();
            posTag = t.getPosTag();

            if (i != 0) {
                pattern.append(term.getBytes(), 0, term.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            }

            pattern.append(dependType.getBytes(), 0, dependType.getLength());
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

        }
    } else {
        dependType = tokens.get(leftPath.get(0) - 1).getDependType();
        posTag = tokens.get(leftPath.get(0) - 1).getPosTag();

        pattern.append(dependType.getBytes(), 0, dependType.getLength());
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    }

    // root portion of pattern
    if (leftPath.get(leftPath.size() - 1) != rightPath.get(rightPath.size() - 1)) {
        throw new RuntimeException(
                "Left and right paths do not share the same root! -- " + leftPath + " <--> " + rightPath);
    }

    t = tokens.get(leftPath.get(leftPath.size() - 1) - 1);

    term = mUseLemmas ? t.getLemma() : t.getToken();
    dependType = t.getDependType();
    posTag = t.getPosTag();

    pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    pattern.append(term.getBytes(), 0, term.getLength());
    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());

    // right path portion of pattern
    if (!mHeadOnly) {
        for (int i = rightPath.size() - 2; i >= 0; i--) {
            t = tokens.get(rightPath.get(i) - 1);

            term = mUseLemmas ? t.getLemma() : t.getToken();
            dependType = t.getDependType();
            posTag = t.getPosTag();

            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            pattern.append(dependType.getBytes(), 0, dependType.getLength());

            if (i != 0) {
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(term.getBytes(), 0, term.getLength());
            }
        }
    } else {
        dependType = tokens.get(rightPath.get(0) - 1).getDependType();
        posTag = tokens.get(rightPath.get(0) - 1).getPosTag();

        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        pattern.append(dependType.getBytes(), 0, dependType.getLength());
    }

    // encode right context chunk type
    // TODO: replace this with a more robust way of checking if this is an actual named entity or not
    Text rightNETag = tokens.get(rightPath.get(0) - 1).getNETag();
    Text rightChunkTag = tokens.get(rightPath.get(0) - 1).getChunkTag();
    if (rightNETag.getLength() != 1 || (rightNETag.getLength() > 0 && rightNETag.getBytes()[0] != 'O')) {
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        pattern.append(rightNETag.getBytes(), 0, rightNETag.getLength());
    } else {
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        if (rightChunkTag.getLength() > 2) {
            pattern.append(rightChunkTag.getBytes(), 2, rightChunkTag.getLength() - 2);
        } else {
            pattern.append(rightChunkTag.getBytes(), 0, rightChunkTag.getLength());
        }
    }

    if (mOrContextStyle) {
        if (!mRightOnlyContextStyle) {
            ContextPatternWritable c = new ContextPatternWritable();
            c.setContext(MavunoUtils.createContext(leftChunk, MavunoUtils.ASTERISK));
            c.setPattern(pattern);
            contexts.add(c);
        }
        if (!mLeftOnlyContextStyle) {
            ContextPatternWritable c = new ContextPatternWritable();
            c.setContext(MavunoUtils.createContext(MavunoUtils.ASTERISK, rightChunk));
            c.setPattern(pattern);
            contexts.add(c);
        }
    } else {
        ContextPatternWritable c = new ContextPatternWritable();
        c.setContext(MavunoUtils.createContext(leftChunk, rightChunk));
        c.setPattern(pattern);
        contexts.add(c);
    }

    return contexts;
}
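
Nearly every append in the example above follows the pattern x.append(src.getBytes(), 0, src.getLength()). The bound matters because the backing array returned by getBytes() may be longer than the valid data. A minimal sketch of the idiom in isolation (class name illustrative):

import org.apache.hadoop.io.Text;

public class SafeAppendDemo {
    public static void main(String[] args) {
        Text src = new Text("example");
        Text dst = new Text();
        // copy only the valid region; bytes past getLength() may be stale
        dst.append(src.getBytes(), 0, src.getLength());
        System.out.println(dst); // example
    }
}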

From source file:edu.isi.mavuno.extract.NAryChunkExtractor.java

License:Apache License

private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // get chunk ids
    List<TratzParsedTokenWritable> sentenceTokens = sentence.getTokens();
    int[] chunkIds = NLProcTools.getChunkIds(sentenceTokens);

    mChunks.clear();
    mChunkTokens.clear();

    // extract chunks from sentence
    for (int i = 0; i < chunkIds.length; i++) {
        if (i > 0 && chunkIds[i] != chunkIds[i - 1]) {
            Chunk chunk = createChunk(mChunkTokens);
            mChunks.add(chunk);
            mChunkTokens.clear();
        }
        mChunkTokens.add(sentenceTokens.get(i));
    }

    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) {
        Chunk chunk = createChunk(mChunkTokens);
        mChunks.add(chunk);
    }

    // there's nothing we can do if there aren't at least mArity chunks in the sentence
    if (mArity > mChunks.size()) {
        mChunkPairsIter = mChunkPairs.iterator();
        return;
    }

    // initialize context positions
    for (int i = 0; i < mArity; i++) {
        mContextPositions[i] = i;
    }

    // initialize pattern positions
    for (int i = 0; i < mArity - 1; i++) {
        mPatternPositions[i] = i;
    }

    // generate (context, pattern) pairs based on chunks
    final Text basePattern = new Text();
    while (true) {
        // construct context
        for (int i = 0; i < mArity; i++) {
            mContextChunks[i] = mChunks.get(mContextPositions[i]);
        }

        // construct pattern
        for (int i = 0; i < mArity - 1; i++) {
            mPatternChunks[i] = mChunks.get(mPatternPositions[i]);
        }

        // add (context, pattern) pair
        basePattern.clear();
        for (int i = 0; i < mArity - 1; i++) {
            // left chunk type
            basePattern.append(mContextChunks[i].type.getBytes(), 0, mContextChunks[i].type.getLength());
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

            if (mContextPositions[i + 1] - mPatternPositions[i] > 1
                    || mPatternPositions[i] - mContextPositions[i] > 1) {
                if (mPatternPositions[i] == mContextPositions[i]) {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] == mContextPositions[i] + 1) {
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0,
                            mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] + 1 == mContextPositions[i + 1]) {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0,
                            mPatternChunks[i].text.getLength());
                } else {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0,
                            mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                }
            } else if (mPatternPositions[i] == mContextPositions[i]) {
                basePattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
            } else {
                basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
            }
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        }

        // last chunk type
        basePattern.append(mContextChunks[mArity - 1].type.getBytes(), 0,
                mContextChunks[mArity - 1].type.getLength());
        basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

        int[] indices;
        mPermGen.reset();
        while (mPermGen.hasMore()) {
            // get next permutation
            indices = mPermGen.getNext();

            ContextPatternWritable c = new ContextPatternWritable();

            // pattern
            c.setPattern(basePattern);
            Text numLeftText = new Text(mPermGen.getNumLeft() + "/" + mArity);
            c.getPattern().append(numLeftText.getBytes(), 0, numLeftText.getLength());

            // context
            c.getContext().clear();
            for (int i = 0; i < mArity; i++) {
                c.getContext().append(mContextChunks[indices[i]].text.getBytes(), 0,
                        mContextChunks[indices[i]].text.getLength());
                if (i != mArity - 1) {
                    c.getContext().append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                }
            }

            // add to chunk pairs
            mChunkPairs.add(c);
        }

        // get next set of context and pattern positions
        int pos = mArity - 2;
        while (pos >= 0) {
            if (mPatternPositions[pos] + 1 < mChunks.size()
                    && mPatternPositions[pos] + 1 < mContextPositions[pos + 1]) {
                mPatternPositions[pos]++;
                for (int i = pos + 1; i < mArity - 2; i++) {
                    mPatternPositions[i] = mContextPositions[i];
                }
                break;
            }
            pos--;
        }

        // update the context positions if the pattern positions can't be updated any further
        if (pos < 0) {
            pos = mArity - 1;
            while (pos >= 0) {
                if (mContextPositions[pos] + 1 < mChunks.size()
                        && (pos + 1 >= mArity || mContextPositions[pos + 1] - (mContextPositions[pos] + 1) >= 1)
                        && (pos <= 0
                                || mContextPositions[pos] - mContextPositions[pos - 1] - 1 <= mMaxSkipSize)) {
                    mContextPositions[pos]++;
                    if (pos < mArity - 1) {
                        mPatternPositions[pos] = mContextPositions[pos];
                    }

                    for (int i = pos + 1; i < mArity; i++) {
                        mContextPositions[i] = mContextPositions[pos] + (i - pos);
                        if (i < mArity - 1) {
                            mPatternPositions[i] = mContextPositions[i];
                        }
                    }

                    break;
                }
                pos--;
            }

            // if neither the context nor the pattern positions can be updated then we're done
            if (pos < 0) {
                // get iterator
                mChunkPairsIter = mChunkPairs.iterator();
                return;
            }
        }
    }
}

From source file:edu.isi.mavuno.extract.NAryChunkExtractor.java

License:Apache License

private Chunk createChunk(List<TratzParsedTokenWritable> terms) {
    Chunk chunk = new Chunk();

    // construct the chunk text and detect the chunk type
    Text chunkNEType = null;
    for (int i = 0; i < terms.size(); i++) {
        Text term = terms.get(i).getToken();
        Text neTag = terms.get(i).getNETag();

        chunk.text.append(term.getBytes(), 0, term.getLength());

        if (i != terms.size() - 1) {
            chunk.text.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }

        // TODO: replace this with a more robust way of checking if this is an actual named entity or not
        if (neTag.getLength() != 1 || (neTag.getLength() > 0 && neTag.getBytes()[0] != 'O')) {
            chunkNEType = neTag;
        }
    }

    // set the chunk type (note that this is somewhat heuristic)
    if (chunkNEType != null) { // chunk type = named entity type, if present
        chunk.type.set(chunkNEType);
    } else { // otherwise, chunk type = chunk tag
        Text chunkTag = terms.get(0).getChunkTag();
        if (chunkTag.getLength() > 2) {
            chunk.type.set(chunkTag.getBytes(), 2, chunkTag.getLength() - 2);
        } else {
            chunk.type.set(chunkTag);
        }
    }

    return chunk;
}

From source file:edu.isi.mavuno.nlp.NLProcTools.java

License:Apache License

private static Text extractGeneralizedChunk(int chunkStart, int chunkEnd, List<TratzParsedTokenWritable> tokens,
        boolean appendPOSTag) {
    Text chunk = new Text();

    int startPos;
    for (startPos = chunkEnd; startPos >= chunkStart; startPos--) {
        if (!tokens.get(startPos).getPosTag().toString().startsWith("NN")) {
            break;
        }
    }

    if (startPos != chunkStart - 1 && startPos != chunkEnd) {
        Text generalChunk = extractMainChunk(startPos + 1, chunkEnd + 1, tokens, appendPOSTag);
        if (generalChunk != null) {
            chunk.set(generalChunk);
        }
    }

    // return null if no valid terms are found
    if (chunk.getLength() == 0) {
        return null;
    }

    return chunk;
}

From source file:edu.mit.ll.graphulo.pig.backend.GraphuloOneTableStorage.java

License:Apache License

@Override
protected Tuple getTuple(Key key, Value value) throws IOException {
    SortedMap<Key, Value> rowKVs = WholeRowIterator.decodeRow(key, value);
    Tuple tuple = TupleFactory.getInstance().newTuple(columns.size() + 1);

    final Text cfHolder = new Text();
    final Text cqHolder = new Text();
    final Text row = key.getRow();
    int tupleOffset = 0;

    tuple.set(tupleOffset, new DataByteArray(Text.decode(row.getBytes(), 0, row.getLength())));

    for (Column column : this.columns) {
        tupleOffset++;

        switch (column.getType()) {
        case LITERAL:
            cfHolder.set(column.getColumnFamily());
            if (null != column.getColumnQualifier()) {
                cqHolder.set(column.getColumnQualifier());
            } else {
                cqHolder.set(EMPTY_TEXT);
            }

            // Get the key where our literal would exist (accounting for
            // "colf:colq" or "colf:" empty colq)
            Key literalStartKey = new Key(row, cfHolder, cqHolder);

            SortedMap<Key, Value> tailMap = rowKVs.tailMap(literalStartKey);

            // Find the element
            if (tailMap.isEmpty()) {
                tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
            } else {
                Key actualKey = tailMap.firstKey();

                // Only place it in the tuple if it matches the user's
                // request, to avoid using a value from a key with the
                // wrong column qualifier
                if (0 == literalStartKey.compareTo(actualKey, PartialKey.ROW_COLFAM_COLQUAL)) {
                    tuple.set(tupleOffset, new DataByteArray(tailMap.get(actualKey).get()));
                } else {
                    // This row doesn't have the column we were looking for
                    tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
                }
            }

            break;
        case COLFAM_PREFIX:
            cfHolder.set(column.getColumnFamily());
            Range colfamPrefixRange = Range.prefix(row, cfHolder);
            Key colfamPrefixStartKey = new Key(row, cfHolder);

            SortedMap<Key, Value> cfTailMap = rowKVs.tailMap(colfamPrefixStartKey);

            // Find the element
            if (cfTailMap.isEmpty()) {
                tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
            } else {
                HashMap<String, DataByteArray> tupleMap = new HashMap<String, DataByteArray>();

                // Build up a map for all the entries in this row that match
                // the colfam prefix
                for (Entry<Key, Value> entry : cfTailMap.entrySet()) {
                    if (colfamPrefixRange.contains(entry.getKey())) {
                        entry.getKey().getColumnFamily(cfHolder);
                        entry.getKey().getColumnQualifier(cqHolder);
                        DataByteArray val = new DataByteArray(entry.getValue().get());

                        // Avoid adding an extra ':' when colqual is empty
                        if (0 == cqHolder.getLength()) {
                            tupleMap.put(cfHolder.toString(), val);
                        } else {
                            tupleMap.put(cfHolder.toString() + COLON + cqHolder.toString(), val);
                        }
                    } else {
                        break;
                    }
                }

                if (!tupleMap.isEmpty()) {
                    tuple.set(tupleOffset, tupleMap);
                }
            }

            break;
        case COLQUAL_PREFIX:
            cfHolder.set(column.getColumnFamily());
            cqHolder.set(column.getColumnQualifier());
            Range colqualPrefixRange = Range.prefix(row, cfHolder, cqHolder);
            Key colqualPrefixStartKey = new Key(row, cfHolder, cqHolder);

            SortedMap<Key, Value> cqTailMap = rowKVs.tailMap(colqualPrefixStartKey);
            if (cqTailMap.isEmpty()) {
                tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
            } else {
                HashMap<String, DataByteArray> tupleMap = new HashMap<String, DataByteArray>();

                // Build up a map for all the entries in this row that match
                // the colqual prefix
                for (Entry<Key, Value> entry : cqTailMap.entrySet()) {
                    if (colqualPrefixRange.contains(entry.getKey())) {
                        entry.getKey().getColumnFamily(cfHolder);
                        entry.getKey().getColumnQualifier(cqHolder);
                        DataByteArray val = new DataByteArray(entry.getValue().get());

                        // Avoid the extra ':' on empty colqual
                        if (0 == cqHolder.getLength()) {
                            tupleMap.put(cfHolder.toString(), val);
                        } else {
                            tupleMap.put(cfHolder.toString() + COLON + cqHolder.toString(), val);
                        }
                    } else {
                        break;
                    }
                }

                if (!tupleMap.isEmpty()) {
                    tuple.set(tupleOffset, tupleMap);
                }
            }

            break;
        default:
            break;
        }
    }

    return tuple;
}
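
Note the bounded call Text.decode(row.getBytes(), 0, row.getLength()) when the row key is converted above. Decoding the whole backing array instead would be unsafe, since Text reuses its buffer and stale bytes can remain past getLength(). A minimal sketch of the difference (class name illustrative):

import java.nio.charset.CharacterCodingException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

public class BoundedDecodeDemo {
    public static void main(String[] args) throws CharacterCodingException {
        Text t = new Text("abcdef");
        byte[] xy = "xy".getBytes(StandardCharsets.UTF_8);
        t.set(xy, 0, xy.length); // the backing array is reused, which can leave stale bytes behind
        System.out.println(t.getLength());                               // 2
        System.out.println(Text.decode(t.getBytes(), 0, t.getLength())); // xy
        // Text.decode(t.getBytes()) would also decode any stale tail bytes
    }
}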

From source file:edu.uci.ics.hivesterix.serde.lazy.LazySerDe.java

License:Apache License

/**
 * Deserialize a table record to a Lazy struct.
 */
@SuppressWarnings("deprecation")
@Override
public Object deserialize(Writable field) throws SerDeException {
    if (byteArrayRef == null) {
        byteArrayRef = new ByteArrayRef();
    }
    if (field instanceof BytesWritable) {
        BytesWritable b = (BytesWritable) field;
        if (b.getSize() == 0) {
            return null;
        }
        // For backward-compatibility with hadoop 0.17
        byteArrayRef.setData(b.get());
        cachedLazyStruct.init(byteArrayRef.getData(), 0, b.getSize());
    } else if (field instanceof Text) {
        Text t = (Text) field;
        if (t.getLength() == 0) {
            return null;
        }
        byteArrayRef.setData(t.getBytes());
        cachedLazyStruct.init(byteArrayRef.getData(), 0, t.getLength());
    } else {
        throw new SerDeException(getClass().toString() + ": expects either BytesWritable or Text object!");
    }
    return cachedLazyStruct;
}

From source file:edu.uci.ics.hivesterix.serde.lazy.LazySerDe.java

License:Apache License

/**
 * A recursive function that serializes an object to a byte buffer based on
 * its object inspector.
 * 
 * @param byteStream
 *            the byte stream storing the serialization data
 * @param obj
 *            the object to serialize
 * @param objInspector
 *            the object inspector
 */
private void serialize(Output byteStream, Object obj, ObjectInspector objInspector) {

    // do nothing for null object
    if (null == obj) {
        return;
    }

    switch (objInspector.getCategory()) {
    case PRIMITIVE: {
        PrimitiveObjectInspector poi = (PrimitiveObjectInspector) objInspector;
        switch (poi.getPrimitiveCategory()) {
        case VOID: {
            return;
        }
        case BOOLEAN: {
            boolean v = ((BooleanObjectInspector) poi).get(obj);
            byteStream.write((byte) (v ? 1 : 0));
            return;
        }
        case BYTE: {
            ByteObjectInspector boi = (ByteObjectInspector) poi;
            byte v = boi.get(obj);
            byteStream.write(v);
            return;
        }
        case SHORT: {
            ShortObjectInspector spoi = (ShortObjectInspector) poi;
            short v = spoi.get(obj);
            byteStream.write((byte) (v >> 8));
            byteStream.write((byte) (v));
            return;
        }
        case INT: {
            IntObjectInspector ioi = (IntObjectInspector) poi;
            int v = ioi.get(obj);
            LazyUtils.writeVInt(byteStream, v);
            return;
        }
        case LONG: {
            LongObjectInspector loi = (LongObjectInspector) poi;
            long v = loi.get(obj);
            LazyUtils.writeVLong(byteStream, v);
            return;
        }
        case FLOAT: {
            FloatObjectInspector foi = (FloatObjectInspector) poi;
            int v = Float.floatToIntBits(foi.get(obj));
            byteStream.write((byte) (v >> 24));
            byteStream.write((byte) (v >> 16));
            byteStream.write((byte) (v >> 8));
            byteStream.write((byte) (v));
            return;
        }
        case DOUBLE: {
            DoubleObjectInspector doi = (DoubleObjectInspector) poi;
            long v = Double.doubleToLongBits(doi.get(obj));
            byteStream.write((byte) (v >> 56));
            byteStream.write((byte) (v >> 48));
            byteStream.write((byte) (v >> 40));
            byteStream.write((byte) (v >> 32));
            byteStream.write((byte) (v >> 24));
            byteStream.write((byte) (v >> 16));
            byteStream.write((byte) (v >> 8));
            byteStream.write((byte) (v));
            return;
        }
        case STRING: {
            StringObjectInspector soi = (StringObjectInspector) poi;
            Text t = soi.getPrimitiveWritableObject(obj);
            /* write byte size of the string which is a vint */
            int length = t.getLength();
            LazyUtils.writeVInt(byteStream, length);
            /* write string itself */
            byte[] data = t.getBytes();
            byteStream.write(data, 0, length);
            return;
        }
        default: {
            throw new RuntimeException("Unrecognized type: " + poi.getPrimitiveCategory());
        }
        }
    }
    case LIST: {
        ListObjectInspector loi = (ListObjectInspector) objInspector;
        ObjectInspector eoi = loi.getListElementObjectInspector();

        // 1/ reserve space for the byte size of the list,
        // which is an integer and takes four bytes
        int byteSizeStart = byteStream.getCount();
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        int listStart = byteStream.getCount();

        // 2/ write the size of the list as a VInt
        int size = loi.getListLength(obj);
        LazyUtils.writeVInt(byteStream, size);

        // 3/ write the null bytes
        byte nullByte = 0;
        for (int eid = 0; eid < size; eid++) {
            // set the bit to 1 if an element is not null
            if (null != loi.getListElement(obj, eid)) {
                nullByte |= 1 << (eid % 8);
            }
            // store the byte every eight elements or
            // if this is the last element
            if (7 == eid % 8 || eid == size - 1) {
                byteStream.write(nullByte);
                nullByte = 0;
            }
        }

        // 4/ write element by element from the list
        for (int eid = 0; eid < size; eid++) {
            serialize(byteStream, loi.getListElement(obj, eid), eoi);
        }

        // 5/ update the list byte size
        int listEnd = byteStream.getCount();
        int listSize = listEnd - listStart;
        byte[] bytes = byteStream.getData();
        bytes[byteSizeStart] = (byte) (listSize >> 24);
        bytes[byteSizeStart + 1] = (byte) (listSize >> 16);
        bytes[byteSizeStart + 2] = (byte) (listSize >> 8);
        bytes[byteSizeStart + 3] = (byte) (listSize);

        return;
    }
    case MAP: {
        MapObjectInspector moi = (MapObjectInspector) objInspector;
        ObjectInspector koi = moi.getMapKeyObjectInspector();
        ObjectInspector voi = moi.getMapValueObjectInspector();
        Map<?, ?> map = moi.getMap(obj);

        // 1/ reserve space for the byte size of the map,
        // which is an integer and takes four bytes
        int byteSizeStart = byteStream.getCount();
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        int mapStart = byteStream.getCount();

        // 2/ write the size of the map as a VInt
        int size = map.size();
        LazyUtils.writeVInt(byteStream, size);

        // 3/ write the null bytes
        int b = 0;
        byte nullByte = 0;
        for (Map.Entry<?, ?> entry : map.entrySet()) {
            // set the bit to 1 if a key is not null
            if (null != entry.getKey()) {
                nullByte |= 1 << (b % 8);
            } else if (!nullMapKey) {
                nullMapKey = true;
                LOG.warn("Null map key encountered! Ignoring similar problems.");
            }
            b++;
            // set the bit to 1 if a value is not null
            if (null != entry.getValue()) {
                nullByte |= 1 << (b % 8);
            }
            b++;
            // write the byte to the stream every 4 key-value pairs
            // or if this is the last key-value pair
            if (0 == b % 8 || b == size * 2) {
                byteStream.write(nullByte);
                nullByte = 0;
            }
        }

        // 4/ write key-value pairs one by one
        for (Map.Entry<?, ?> entry : map.entrySet()) {
            serialize(byteStream, entry.getKey(), koi);
            serialize(byteStream, entry.getValue(), voi);
        }

        // 5/ update the byte size of the map
        int mapEnd = byteStream.getCount();
        int mapSize = mapEnd - mapStart;
        byte[] bytes = byteStream.getData();
        bytes[byteSizeStart] = (byte) (mapSize >> 24);
        bytes[byteSizeStart + 1] = (byte) (mapSize >> 16);
        bytes[byteSizeStart + 2] = (byte) (mapSize >> 8);
        bytes[byteSizeStart + 3] = (byte) (mapSize);

        return;
    }
    case STRUCT: {
        // 1/ reserve space for the byte size of the struct,
        // which is an integer and takes four bytes
        int byteSizeStart = byteStream.getCount();
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        int structStart = byteStream.getCount();

        // 2/ serialize the struct
        serializeStruct(byteStream, obj, (StructObjectInspector) objInspector);

        // 3/ update the byte size of the struct
        int structEnd = byteStream.getCount();
        int structSize = structEnd - structStart;
        byte[] bytes = byteStream.getData();
        bytes[byteSizeStart] = (byte) (structSize >> 24);
        bytes[byteSizeStart + 1] = (byte) (structSize >> 16);
        bytes[byteSizeStart + 2] = (byte) (structSize >> 8);
        bytes[byteSizeStart + 3] = (byte) (structSize);

        return;
    }
    default: {
        throw new RuntimeException("Unrecognized type: " + objInspector.getCategory());
    }
    }
}
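
The STRING case above writes a Text value as a VInt byte length followed by the raw bytes. A minimal sketch of that length-prefixed layout using Hadoop's own WritableUtils (not the LazySerDe code path itself; class name illustrative):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;

public class VIntStringDemo {
    public static void main(String[] args) throws IOException {
        Text t = new Text("hello");
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bos);
        WritableUtils.writeVInt(out, t.getLength()); // the byte length as a VInt
        out.write(t.getBytes(), 0, t.getLength());   // then only the valid bytes
        out.flush();
        System.out.println(bos.size()); // 6: a one-byte VInt plus five payload bytes
    }
}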

From source file:edu.uci.ics.hyracks.hdfs.lib.TextKeyValueParserFactory.java

License:Apache License

@Override
public IKeyValueParser<LongWritable, Text> createKeyValueParser(final IHyracksTaskContext ctx)
        throws HyracksDataException {

    final ArrayTupleBuilder tb = new ArrayTupleBuilder(1);
    final ByteBuffer buffer = ctx.allocateFrame();
    final FrameTupleAppender appender = new FrameTupleAppender(ctx.getFrameSize());
    appender.reset(buffer, true);

    return new IKeyValueParser<LongWritable, Text>() {

        @Override
        public void open(IFrameWriter writer) {

        }

        @Override
        public void parse(LongWritable key, Text value, IFrameWriter writer, String fileString)
                throws HyracksDataException {
            tb.reset();
            tb.addField(value.getBytes(), 0, value.getLength());
            if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                FrameUtils.flushFrame(buffer, writer);
                appender.reset(buffer, true);
                if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                    throw new HyracksDataException("tuple cannot be appended into the frame");
                }
            }
        }

        @Override
        public void close(IFrameWriter writer) throws HyracksDataException {
            FrameUtils.flushFrame(buffer, writer);
        }

    };
}

From source file:edu.uci.ics.hyracks.imru.dataflow.Hdtest.java

License:Apache License

public static JobSpecification createJob() throws Exception {
    JobSpecification spec = new JobSpecification();
    spec.setFrameSize(4096);

    String PATH_TO_HADOOP_CONF = "/home/wangrui/a/imru/hadoop-0.20.2/conf";
    String HDFS_INPUT_PATH = "/customer/customer.tbl,/customer_result/part-0";
    JobConf conf = new JobConf();
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/hdfs-site.xml"));
    FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
    conf.setInputFormat(TextInputFormat.class);
    RecordDescriptor recordDesc = new RecordDescriptor(
            new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer() });
    InputSplit[] splits = conf.getInputFormat().getSplits(conf, 1);
    HDFSReadOperatorDescriptor readOperator = new HDFSReadOperatorDescriptor(spec, recordDesc, conf, splits,
            new String[] { "NC0", "NC1" }, new IKeyValueParserFactory<LongWritable, Text>() {
                @Override
                public IKeyValueParser<LongWritable, Text> createKeyValueParser(final IHyracksTaskContext ctx) {
                    return new IKeyValueParser<LongWritable, Text>() {
                        TupleWriter tupleWriter;

                        @Override
                        public void open(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter = new TupleWriter(ctx, writer, 1);
                        }

                        @Override
                        public void parse(LongWritable key, Text value, IFrameWriter writer, String fileString)
                                throws HyracksDataException {
                            try {
                                tupleWriter.write(value.getBytes(), 0, value.getLength());
                                tupleWriter.finishField();
                                tupleWriter.finishTuple();
                            } catch (IOException e) {
                                throw new HyracksDataException(e);
                            }
                        }

                        @Override
                        public void close(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter.close();
                        }
                    };
                }

            });

    // createPartitionConstraint(spec, readOperator, new String[] {"NC0"});
    PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, readOperator, new String[] { "NC0", "NC1" });

    IOperatorDescriptor writer = new HDFSOD(spec, null, null, null);
    // createPartitionConstraint(spec, writer, outSplits);

    spec.connect(new OneToOneConnectorDescriptor(spec), readOperator, 0, writer, 0);

    spec.addRoot(writer);
    return spec;
}

From source file:edu.uci.ics.pregelix.example.aggregator.OverflowAggregator.java

License:Apache License

@Override
public void step(Text partialResult) {
    textLength += partialResult.getLength();
}
}