Example usage for org.apache.hadoop.io Text compareTo

List of usage examples for org.apache.hadoop.io Text compareTo

Introduction

This page shows example usage of the org.apache.hadoop.io.Text method compareTo.

Prototype

public int compareTo(byte[] other, int off, int len) 

Source Link

Document

Compare bytes from {@link #getBytes()} to those provided.

Usage

From source file:edu.isi.mavuno.extract.ChunkExtractor.java

License:Apache License

/**
 * Rebuilds the chunk pairs for the next sentence returned by {@code mSentIter}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>clear {@code mChunkPairs}, fetch the next sentence, clear {@code mChunks} and
 *       {@code mChunkTokens};</li>
 *   <li>group the sentence's tokens into chunks via {@code createChunk} and collect them
 *       in {@code mChunks};</li>
 *   <li>hand (context, pattern, context) triples to {@code addPair}: first for adjacent
 *       chunks, then for chunks separated by up to {@code mMaxSkipSize} skipped chunks,
 *       subject to the context-style flags;</li>
 *   <li>set {@code mChunkPairsIter} to {@code mChunkPairs.iterator()}.</li>
 * </ol>
 */
private void loadChunkPairs() {
    // drop the pairs left over from the previous sentence
    mChunkPairs.clear();

    // advance to the next sentence
    SentenceWritable<TratzParsedTokenWritable> currentSentence = mSentIter.next();

    // reset the chunking state before walking the tokens
    mChunks.clear();
    mChunkTokens.clear();

    List<TratzParsedTokenWritable> sentenceTokens = currentSentence.getTokens();
    Text previousNETag = new Text();
    for (TratzParsedTokenWritable token : sentenceTokens) {
        // first byte of the chunk tag, or 0 when the tag is empty
        Text chunkTag = token.getChunkTag();
        byte chunkKind = 0;
        if (chunkTag.getLength() > 0) {
            chunkKind = chunkTag.getBytes()[0];
        }

        Text neTag = token.getNETag();
        boolean neTagChanged = neTag.compareTo(previousNETag.getBytes(), 0,
                previousNETag.getLength()) != 0;
        // the length test guards the byte access, so an empty tag never indexes the array
        boolean neTagIsSingleO = neTag.getLength() == 1 && neTag.getBytes()[0] == 'O';
        boolean chunkTagIsBOrO = chunkKind == 'B' || chunkKind == 'O';

        // A new chunk begins when the NE tag differs from the previous token's, or when
        // the NE tag is exactly "O" and the chunk tag starts with 'B' or 'O'.
        if (neTagChanged || (neTagIsSingleO && chunkTagIsBOrO)) {
            // flush the tokens gathered so far; chunks of every type are kept
            // (a former filter on mChunkType being 'O' is disabled)
            if (mChunkTokens.size() > 0) {
                mChunks.add(createChunk(mChunkTokens, mChunkType));
            }
            mChunkTokens.clear();
            mChunkType.set(chunkTag);
        }

        mChunkTokens.add(token.getToken());
        previousNETag.set(neTag);
    }

    // flush whatever remains at the end of the sentence
    if (mChunkTokens.size() > 0) {
        mChunks.add(createChunk(mChunkTokens, mChunkType));
    }

    // adjacent pairs: for each neighbouring (current, following) pair of chunks, emit one
    // pattern made of current + marker and one made of marker + following
    for (int idx = 0; idx + 1 < mChunks.size(); idx++) {
        Text current = mChunks.get(idx);
        Text following = mChunks.get(idx + 1);

        Text chunkThenMarker = new Text();
        chunkThenMarker.append(current.getBytes(), 0, current.getLength());
        chunkThenMarker.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        addPair(current, chunkThenMarker, following);

        Text markerThenChunk = new Text();
        markerThenChunk.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        markerThenChunk.append(following.getBytes(), 0, following.getLength());
        addPair(current, markerThenChunk, following);
    }

    // non-adjacent pairs: every chunk acts as the pattern, with contexts taken from the
    // chunks up to mMaxSkipSize positions further left and right
    for (int center = 0; center < mChunks.size(); center++) {
        Text pattern = mChunks.get(center);

        for (int leftSkip = 0; leftSkip <= mMaxSkipSize; leftSkip++) {
            int leftIdx = center - leftSkip - 1;
            if (leftIdx < 0) {
                // no chunk that far to the left
                continue;
            }
            Text leftContext = mChunks.get(leftIdx);

            // "or" style emits the left context on its own unless only right contexts are wanted
            if (mOrContextStyle && !mRightOnlyContextStyle) {
                addPair(leftContext, pattern, ContextPatternWritable.ASTERISK);
            }

            // right contexts are skipped only for "or" style restricted to left contexts
            if (!mOrContextStyle || !mLeftOnlyContextStyle) {
                for (int rightSkip = 0; rightSkip <= mMaxSkipSize; rightSkip++) {
                    int rightIdx = center + rightSkip + 1;
                    if (rightIdx < mChunks.size()) {
                        if (mOrContextStyle) {
                            // "or" style: right context on its own
                            addPair(ContextPatternWritable.ASTERISK, pattern, mChunks.get(rightIdx));
                        } else {
                            // otherwise both contexts together
                            addPair(leftContext, pattern, mChunks.get(rightIdx));
                        }
                    }
                }
            }
        }
    }

    // expose the freshly built pairs to the caller's iteration
    mChunkPairsIter = mChunkPairs.iterator();
}