Example usage for org.apache.hadoop.io Text append

Introduction

This page lists example usages of org.apache.hadoop.io.Text#append, drawn from open-source projects.

Prototype

public void append(byte[] utf8, int start, int len) 

Document

Append a range of bytes to the end of the given text.
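
Before the project-specific examples below, here is a minimal, self-contained sketch of how append is typically called. It is illustrative only (the class name, strings, and output comments are assumptions, not taken from any of the sources below). Note that Text.getBytes() returns the backing array, which may be longer than the logical content, so it should always be paired with getLength(), as every example on this page does.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class TextAppendDemo {
    public static void main(String[] args) {
        Text text = new Text("Hello");

        // Append a raw byte range to the end of the existing content.
        byte[] suffix = ", world".getBytes(StandardCharsets.UTF_8);
        text.append(suffix, 0, suffix.length);
        System.out.println(text); // Hello, world

        // When appending from another Text, pair getBytes() with getLength():
        // the backing array may contain stale bytes past the logical length.
        Text other = new Text("!");
        text.append(other.getBytes(), 0, other.getLength());
        System.out.println(text); // Hello, world!
    }
}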

Usage

From source file: edu.isi.mavuno.extract.DIRTExtractor.java

License: Apache License

private void loadDependPairs() {
    // clear dependency pairs
    mDependPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // get sentence tokens
    List<TratzParsedTokenWritable> tokens = sentence.getTokens();

    // get chunk ids
    int[] chunkIds = NLProcTools.getChunkIds(tokens);

    // get mapping from positions to chunks
    Text[] chunks = new Text[tokens.size()];

    Text curChunk = null;
    for (int i = 0; i < tokens.size(); i++) {
        Text text = tokens.get(i).getToken();

        if (i == 0 || chunkIds[i] != chunkIds[i - 1]) {
            curChunk = new Text(text);
        } else {
            curChunk.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
            curChunk.append(text.getBytes(), 0, text.getLength());
        }

        chunks[i] = curChunk;
    }

    // populate parse tree
    ArrayListOfInts[] children = new ArrayListOfInts[tokens.size() + 1];
    for (int i = 0; i < tokens.size() + 1; i++) {
        children[i] = new ArrayListOfInts();
    }

    for (int i = 0; i < tokens.size(); i++) {
        TratzParsedTokenWritable t = tokens.get(i);

        // ignore punctuation
        if (!t.getDependType().equals(PUNCTUATION_TYPE)) {
            children[t.getDependIndex()].add(i + 1);
        }
    }

    // extract (context, pattern) pairs from parse tree
    for (int i = 0; i < children[0].size(); i++) {
        extractPairs(children, children[0].get(i), tokens, chunks);
    }

    // get iterator
    mDependPairsIter = mDependPairs.iterator();
}

From source file: edu.isi.mavuno.extract.DIRTExtractor.java

License: Apache License

private List<ContextPatternWritable> getContext(ArrayListOfInts leftPath, ArrayListOfInts rightPath,
        List<TratzParsedTokenWritable> tokens, Text[] chunks) { //, int leftContextSize, int rightContextSize) {
    // construct (context, pattern) pairs
    List<ContextPatternWritable> contexts = new ArrayList<ContextPatternWritable>();

    // make sure that the dimensions are feasible
    if (leftPath.size() < 1 || rightPath.size() < 1) {
        return contexts;
    }

    // make sure we don't split the left context's chunk
    Text leftChunk = chunks[leftPath.get(0) - 1];
    for (int i = 1; i <= leftPath.size() - 1; i++) {
        if (chunks[leftPath.get(i) - 1].equals(leftChunk)) {
            return contexts;
        }
    }

    // make sure we don't split the right context's chunk
    Text rightChunk = chunks[rightPath.get(0) - 1];
    for (int i = rightPath.size() - 1; i >= 1; i--) {
        if (chunks[rightPath.get(i) - 1].equals(rightChunk)) {
            return contexts;
        }
    }

    TratzParsedTokenWritable t;
    Text term, posTag, dependType;

    // construct pattern based on the parse tree path
    final Text pattern = new Text();

    // encode left context chunk type
    // TODO: replace this with a more robust way of checking if this is an actual named entity or not
    Text leftNETag = tokens.get(leftPath.get(0) - 1).getNETag();
    Text leftChunkTag = tokens.get(leftPath.get(0) - 1).getChunkTag();
    if (leftNETag.getLength() != 1 || (leftNETag.getLength() > 0 && leftNETag.getBytes()[0] != 'O')) {
        pattern.append(leftNETag.getBytes(), 0, leftNETag.getLength());
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    } else {
        if (leftChunkTag.getLength() > 2) {
            pattern.append(leftChunkTag.getBytes(), 2, leftChunkTag.getLength() - 2);
        } else {
            pattern.append(leftChunkTag.getBytes(), 0, leftChunkTag.getLength());
        }
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    }

    // left path portion of pattern
    if (!mHeadOnly) {
        for (int i = 0; i <= leftPath.size() - 2; i++) {
            t = tokens.get(leftPath.get(i) - 1);

            term = mUseLemmas ? t.getLemma() : t.getToken();
            dependType = t.getDependType();
            posTag = t.getPosTag();

            if (i != 0) {
                pattern.append(term.getBytes(), 0, term.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            }

            pattern.append(dependType.getBytes(), 0, dependType.getLength());
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

        }
    } else {
        dependType = tokens.get(leftPath.get(0) - 1).getDependType();
        posTag = tokens.get(leftPath.get(0) - 1).getPosTag();

        pattern.append(dependType.getBytes(), 0, dependType.getLength());
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    }

    // root portion of pattern
    if (leftPath.get(leftPath.size() - 1) != rightPath.get(rightPath.size() - 1)) {
        throw new RuntimeException(
                "Left and right paths do not share the same root! -- " + leftPath + " <--> " + rightPath);
    }

    t = tokens.get(leftPath.get(leftPath.size() - 1) - 1);

    term = mUseLemmas ? t.getLemma() : t.getToken();
    dependType = t.getDependType();
    posTag = t.getPosTag();

    pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    pattern.append(term.getBytes(), 0, term.getLength());
    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());

    // right path portion of pattern
    if (!mHeadOnly) {
        for (int i = rightPath.size() - 2; i >= 0; i--) {
            t = tokens.get(rightPath.get(i) - 1);

            term = mUseLemmas ? t.getLemma() : t.getToken();
            dependType = t.getDependType();
            posTag = t.getPosTag();

            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            pattern.append(dependType.getBytes(), 0, dependType.getLength());

            if (i != 0) {
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(term.getBytes(), 0, term.getLength());
            }
        }
    } else {
        dependType = tokens.get(rightPath.get(0) - 1).getDependType();
        posTag = tokens.get(rightPath.get(0) - 1).getPosTag();

        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        pattern.append(dependType.getBytes(), 0, dependType.getLength());
    }

    // encode right context chunk type
    // TODO: replace this with a more robust way of checking if this is an actual named entity or not
    Text rightNETag = tokens.get(rightPath.get(0) - 1).getNETag();
    Text rightChunkTag = tokens.get(rightPath.get(0) - 1).getChunkTag();
    if (rightNETag.getLength() != 1 || (rightNETag.getLength() > 0 && rightNETag.getBytes()[0] != 'O')) {
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        pattern.append(rightNETag.getBytes(), 0, rightNETag.getLength());
    } else {
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        if (rightChunkTag.getLength() > 2) {
            pattern.append(rightChunkTag.getBytes(), 2, rightChunkTag.getLength() - 2);
        } else {
            pattern.append(rightChunkTag.getBytes(), 0, rightChunkTag.getLength());
        }
    }

    if (mOrContextStyle) {
        if (!mRightOnlyContextStyle) {
            ContextPatternWritable c = new ContextPatternWritable();
            c.setContext(MavunoUtils.createContext(leftChunk, MavunoUtils.ASTERISK));
            c.setPattern(pattern);
            contexts.add(c);
        }
        if (!mLeftOnlyContextStyle) {
            ContextPatternWritable c = new ContextPatternWritable();
            c.setContext(MavunoUtils.createContext(MavunoUtils.ASTERISK, rightChunk));
            c.setPattern(pattern);
            contexts.add(c);
        }
    } else {
        ContextPatternWritable c = new ContextPatternWritable();
        c.setContext(MavunoUtils.createContext(leftChunk, rightChunk));
        c.setPattern(pattern);
        contexts.add(c);
    }

    return contexts;
}

From source file: edu.isi.mavuno.extract.NAryChunkExtractor.java

License: Apache License

private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // get chunk ids
    List<TratzParsedTokenWritable> sentenceTokens = sentence.getTokens();
    int[] chunkIds = NLProcTools.getChunkIds(sentenceTokens);

    mChunks.clear();
    mChunkTokens.clear();

    // extract chunks from sentence
    for (int i = 0; i < chunkIds.length; i++) {
        if (i > 0 && chunkIds[i] != chunkIds[i - 1]) {
            Chunk chunk = createChunk(mChunkTokens);
            mChunks.add(chunk);
            mChunkTokens.clear();
        }
        mChunkTokens.add(sentenceTokens.get(i));
    }

    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) {
        Chunk chunk = createChunk(mChunkTokens);
        mChunks.add(chunk);
    }

    // there's nothing we can do if there aren't at least mArity chunks in the sentence
    if (mArity > mChunks.size()) {
        mChunkPairsIter = mChunkPairs.iterator();
        return;
    }

    // initialize context positions
    for (int i = 0; i < mArity; i++) {
        mContextPositions[i] = i;
    }

    // initialize pattern positions
    for (int i = 0; i < mArity - 1; i++) {
        mPatternPositions[i] = i;
    }

    // generate (context, pattern) pairs based on chunks
    final Text basePattern = new Text();
    while (true) {
        // construct context
        for (int i = 0; i < mArity; i++) {
            mContextChunks[i] = mChunks.get(mContextPositions[i]);
        }

        // construct pattern
        for (int i = 0; i < mArity - 1; i++) {
            mPatternChunks[i] = mChunks.get(mPatternPositions[i]);
        }

        // add (context, pattern) pair
        basePattern.clear();
        for (int i = 0; i < mArity - 1; i++) {
            // left chunk type
            basePattern.append(mContextChunks[i].type.getBytes(), 0, mContextChunks[i].type.getLength());
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

            if (mContextPositions[i + 1] - mPatternPositions[i] > 1
                    || mPatternPositions[i] - mContextPositions[i] > 1) {
                if (mPatternPositions[i] == mContextPositions[i]) {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] == mContextPositions[i] + 1) {
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0,
                            mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] + 1 == mContextPositions[i + 1]) {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0,
                            mPatternChunks[i].text.getLength());
                } else {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0,
                            mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                }
            } else if (mPatternPositions[i] == mContextPositions[i]) {
                basePattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
            } else {
                basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
            }
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        }

        // last chunk type
        basePattern.append(mContextChunks[mArity - 1].type.getBytes(), 0,
                mContextChunks[mArity - 1].type.getLength());
        basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

        int[] indices;
        mPermGen.reset();
        while (mPermGen.hasMore()) {
            // get next permutation
            indices = mPermGen.getNext();

            ContextPatternWritable c = new ContextPatternWritable();

            // pattern
            c.setPattern(basePattern);
            Text numLeftText = new Text(mPermGen.getNumLeft() + "/" + mArity);
            c.getPattern().append(numLeftText.getBytes(), 0, numLeftText.getLength());

            // context
            c.getContext().clear();
            for (int i = 0; i < mArity; i++) {
                c.getContext().append(mContextChunks[indices[i]].text.getBytes(), 0,
                        mContextChunks[indices[i]].text.getLength());
                if (i != mArity - 1) {
                    c.getContext().append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                }
            }

            // add to chunk pairs
            mChunkPairs.add(c);
        }

        // get next set of context and pattern positions
        int pos = mArity - 2;
        while (pos >= 0) {
            if (mPatternPositions[pos] + 1 < mChunks.size()
                    && mPatternPositions[pos] + 1 < mContextPositions[pos + 1]) {
                mPatternPositions[pos]++;
                for (int i = pos + 1; i < mArity - 2; i++) {
                    mPatternPositions[i] = mContextPositions[i];
                }
                break;
            }
            pos--;
        }

        // update the context positions if the pattern positions can't be updated any further
        if (pos < 0) {
            pos = mArity - 1;
            while (pos >= 0) {
                if (mContextPositions[pos] + 1 < mChunks.size()
                        && (pos + 1 >= mArity || mContextPositions[pos + 1] - (mContextPositions[pos] + 1) >= 1)
                        && (pos <= 0
                                || mContextPositions[pos] - mContextPositions[pos - 1] - 1 <= mMaxSkipSize)) {
                    mContextPositions[pos]++;
                    if (pos < mArity - 1) {
                        mPatternPositions[pos] = mContextPositions[pos];
                    }

                    for (int i = pos + 1; i < mArity; i++) {
                        mContextPositions[i] = mContextPositions[pos] + (i - pos);
                        if (i < mArity - 1) {
                            mPatternPositions[i] = mContextPositions[i];
                        }
                    }

                    break;
                }
                pos--;
            }

            // if neither the context nor the pattern positions can be updated then we're done
            if (pos < 0) {
                // get iterator
                mChunkPairsIter = mChunkPairs.iterator();
                return;
            }
        }
    }
}

From source file: edu.isi.mavuno.extract.NGramExtractor.java

License: Apache License

private boolean getSpan(Text span, Text[] terms, int start, int len) {
    if (start < 0 || start + len > terms.length) {
        return false;
    }

    span.clear();
    for (int i = start; i < start + len; i++) {
        span.append(terms[i].getBytes(), 0, terms[i].getLength());

        if (i != start + len - 1) {
            span.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
    }

    return true;
}
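
For illustration, a hypothetical call to this helper (the terms array and values are invented for the example; since getSpan is private, such a call would live inside NGramExtractor itself):

// Hypothetical usage: join terms[1..3] with single spaces into span.
Text[] terms = { new Text("the"), new Text("quick"), new Text("brown"), new Text("fox") };
Text span = new Text();
if (getSpan(span, terms, 1, 3)) {
    System.out.println(span); // quick brown fox
}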

From source file: edu.isi.mavuno.util.MavunoUtils.java

License: Apache License

public static final Text createContext(String separator, Text... args) {
    Text context = new Text();

    // Use the separator's byte length rather than String.length(); the two
    // differ whenever the separator contains multi-byte characters.
    byte[] sepBytes = separator.getBytes();

    for (int i = 0; i < args.length; i++) {
        context.append(args[i].getBytes(), 0, args[i].getLength());
        if (i != args.length - 1) {
            context.append(sepBytes, 0, sepBytes.length);
        }
    }

    return context;
}
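
A quick illustrative call (the argument values are made up for the example):

// Join two Text fields with a tab separator.
Text left = new Text("subject");
Text right = new Text("object");
Text context = MavunoUtils.createContext("\t", left, right);
System.out.println(context); // subject<TAB>object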

From source file: edu.umn.cs.spatialHadoop.core.CSVOGC.java

License: Open Source License

@Override
public Text toText(Text text) {
    // Wrap the serialized shape with the optional prefix and suffix bytes.
    if (prefix != null)
        text.append(prefix, 0, prefix.length);
    super.toText(text);
    if (suffix != null)
        text.append(suffix, 0, suffix.length);
    return text;
}

From source file: edu.umn.cs.spatialHadoop.core.GridInfo.java

License: Open Source License

@Override
public Text toText(Text text) {
    final byte[] Comma = ",".getBytes();
    super.toText(text);
    text.append(Comma, 0, Comma.length);
    TextSerializerHelper.serializeLong(columns, text, ',');
    TextSerializerHelper.serializeLong(rows, text, '\0');
    return text;
}

From source file: edu.umn.cs.spatialHadoop.core.JTSShape.java

License: Apache License

@Override
public Text toText(Text text) {
    //TextSerializerHelper.serializeLong(0, text, '\t');
    String str = WKBWriter.bytesToHex(wkbWriter.write(geom));
    byte[] str_b = str.getBytes();
    text.append(str_b, 0, str_b.length);
    if (extra != null) {
        text.append(Separator, 0, Separator.length);
        byte[] extra_bytes = extra.getBytes();
        text.append(extra_bytes, 0, extra_bytes.length);
    }
    return text;
}

From source file: edu.umn.cs.spatialHadoop.core.OGCShape.java

License: Apache License

@Override
public Text toText(Text text) {
    String str = bytesToHex(geom.asBinary().array());
    byte[] str_b = str.getBytes();
    text.append(str_b, 0, str_b.length);
    if (extra != null) {
        text.append(Separator, 0, Separator.length);
        byte[] extra_bytes = extra.getBytes();
        text.append(extra_bytes, 0, extra_bytes.length);
    }
    return text;
}

From source file: edu.umn.cs.spatialHadoop.core.Partition.java

License: Open Source License

@Override
public Text toText(Text text) {
    // Serialize the base cell info, then the record count, size, and file name.
    super.toText(text);
    text.append(new byte[] { ',' }, 0, 1);
    TextSerializerHelper.serializeLong(recordCount, text, ',');
    TextSerializerHelper.serializeLong(size, text, ',');
    byte[] temp = (filename == null ? "" : filename).getBytes();
    text.append(temp, 0, temp.length);
    return text;
}