Example usage for org.apache.hadoop.io Text getLength

List of usage examples for org.apache.hadoop.io Text getLength

Introduction

On this page you can find example usage for org.apache.hadoop.io.Text.getLength.

Prototype

@Override
public int getLength() 

Source Link

Document

Returns the number of bytes in the backing byte array, i.e. the length of the UTF-8 encoded value rather than the number of characters.
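
Because Text stores its contents as UTF-8, getLength() can differ from both the character count and the size of the array returned by getBytes(). The following is a minimal, self-contained sketch (the class name TextGetLengthDemo is illustrative and assumes hadoop-common is on the classpath) demonstrating both points:

import org.apache.hadoop.io.Text;

public class TextGetLengthDemo {
    public static void main(String[] args) {
        // getLength() counts UTF-8 bytes, not characters.
        Text ascii = new Text("hadoop");
        System.out.println(ascii.getLength());            // 6

        Text accented = new Text("café");                 // 'é' encodes to two UTF-8 bytes
        System.out.println(accented.getLength());         // 5
        System.out.println(accented.toString().length()); // 4 characters

        // The array returned by getBytes() may be longer than getLength();
        // only the first getLength() bytes are valid data.
        Text t = new Text("hadoop");
        t.set(new Text("pig"));
        System.out.println(t.getLength());                // 3
        System.out.println(t.getBytes().length);          // 6 -- backing array is not shortened
    }
}

This is also why the snippets below pass getLength() as the length argument to calls such as append(bytes, start, length) instead of relying on getBytes().length.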

Usage

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
public void mutability() throws IOException {
    // vv TextTest-Mutability
    Text t = new Text("hadoop");
    t.set("pig");
    assertThat(t.getLength(), is(3));
    assertThat(t.getBytes().length, is(3));
    // ^^ TextTest-Mutability
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
public void byteArrayNotShortened() throws IOException {
    // vv TextTest-ByteArrayNotShortened
    Text t = new Text("hadoop");
    t.set(/*[*/new Text("pig")/*]*/);
    assertThat(t.getLength(), is(3));
    assertThat("Byte length not shortened", t.getBytes().length, /*[*/is(6)/*]*/);
    // ^^ TextTest-ByteArrayNotShortened
}

From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java

License:Apache License

/**
 * Reads a list of file paths, one path per line, from the given input split.
 * The code in this method is adapted from Hadoop's LineRecordReader.
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();

    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;

    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fis,
                    decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;

    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {

        if (pos == 0) {
            // Strip BOM(Byte Order Mark)
            // Text only support UTF-8, we only need to check UTF-8 BOM
            // (0xEF,0xBB,0xBF) at the start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // find UTF-8 BOM, strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and 
                    // not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }

        paths.add(nextLine.toString());
        // record the path that was just read and log its size and position
        LOG.info("Read line of size " + newSize + " at pos " + (pos - newSize));
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}

From source file:diamondmapreduce.NLineRecordReader.java

License:Apache License

@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (key == null) {
        key = new LongWritable();
    }
    key.set(pos);
    if (value == null) {
        value = new Text();
    }
    value.clear();
    final Text endline = new Text("\n");
    int newSize = 0;
    for (int i = 0; i < NLINESTOPROCESS; i++) {
        Text v = new Text();
        while (pos < end) {
            newSize = in.readLine(v, maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
            value.append(v.getBytes(), 0, v.getLength());
            value.append(endline.getBytes(), 0, endline.getLength());
            if (newSize == 0) {
                break;
            }
            pos += newSize;
            if (newSize < maxLineLength) {
                break;
            }
        }
    }
    if (newSize == 0) {
        key = null;
        value = null;
        return false;
    } else {
        return true;
    }
}

From source file:eastcircle.terasort.TotalOrderPartitioner.java

License:Apache License

/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly.
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper, Text prefix, int maxDepth) {
    int depth = prefix.getLength();
    if (depth >= maxDepth || lower == upper) {
        return new LeafTrieNode(depth, splits, lower, upper);
    }
    InnerTrieNode result = new InnerTrieNode(depth);
    Text trial = new Text(prefix);
    // append an extra byte on to the prefix
    trial.append(new byte[1], 0, 1);
    int currentBound = lower;
    for (int ch = 0; ch < 255; ++ch) {
        trial.getBytes()[depth] = (byte) (ch + 1);
        lower = currentBound;
        while (currentBound < upper) {
            if (splits[currentBound].compareTo(trial) >= 0) {
                break;
            }
            currentBound += 1;
        }
        trial.getBytes()[depth] = (byte) ch;
        result.child[ch] = buildTrie(splits, lower, currentBound, trial, maxDepth);
    }
    // pick up the rest
    trial.getBytes()[depth] = (byte) 255;
    result.child[255] = buildTrie(splits, currentBound, upper, trial, maxDepth);
    return result;
}

From source file:edu.cmu.cs.in.hadoop.HoopKeyComparer.java

License:Open Source License

/**
 * Compares two Text values byte-by-byte over their valid byte ranges.
 */
public int compare(Text o1, Text o2) {
    return compare(o1.getBytes(), 0, o1.getLength(), o2.getBytes(), 0, o2.getLength());
}

From source file:edu.isi.mavuno.extract.ChunkExtractor.java

License:Apache License

private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // extract chunks from sentence
    mChunks.clear();
    mChunkTokens.clear();
    List<TratzParsedTokenWritable> tokens = sentence.getTokens();
    Text lastNETag = new Text();
    for (int i = 0; i < tokens.size(); i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        byte chunkType = t.getChunkTag().getLength() > 0 ? t.getChunkTag().getBytes()[0] : 0;
        Text neTag = t.getNETag();
        if (neTag.compareTo(lastNETag.getBytes(), 0, lastNETag.getLength()) != 0
                || (neTag.getLength() == 1 && (neTag.getLength() > 0 && neTag.getBytes()[0] == 'O'))
                        && (chunkType == 'B' || chunkType == 'O')) {
            if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
                Text chunk = createChunk(mChunkTokens, mChunkType);
                mChunks.add(chunk);
            }
            mChunkTokens.clear();
            mChunkType.set(t.getChunkTag());
        }
        mChunkTokens.add(t.getToken());
        lastNETag.set(neTag);
    }

    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
        Text chunk = createChunk(mChunkTokens, mChunkType);
        mChunks.add(chunk);
    }

    // generate adjacent (context, pattern) pairs
    for (int patternPos = 0; patternPos < mChunks.size() - 1; patternPos++) {
        Text leftPattern = new Text();
        leftPattern.append(mChunks.get(patternPos).getBytes(), 0, mChunks.get(patternPos).getLength());
        leftPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        addPair(mChunks.get(patternPos), leftPattern, mChunks.get(patternPos + 1));

        Text rightPattern = new Text();
        rightPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        rightPattern.append(mChunks.get(patternPos + 1).getBytes(), 0, mChunks.get(patternPos + 1).getLength());
        addPair(mChunks.get(patternPos), rightPattern, mChunks.get(patternPos + 1));
    }

    // generate non-adjacent (context, pattern) pairs based on chunks
    for (int patternPos = 0; patternPos < mChunks.size(); patternPos++) {
        for (int leftSkip = 0; leftSkip <= mMaxSkipSize; leftSkip++) {
            if (patternPos - leftSkip - 1 < 0) {
                continue;
            }

            if (mOrContextStyle && !mRightOnlyContextStyle) {
                addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                        ContextPatternWritable.ASTERISK);
            }

            if (mOrContextStyle && mLeftOnlyContextStyle) {
                continue;
            }

            for (int rightSkip = 0; rightSkip <= mMaxSkipSize; rightSkip++) {
                if (patternPos + rightSkip + 1 >= mChunks.size()) {
                    continue;
                }

                // construct (context, pattern) pair
                if (mOrContextStyle) {
                    addPair(ContextPatternWritable.ASTERISK, mChunks.get(patternPos),
                            mChunks.get(patternPos + rightSkip + 1));
                } else {
                    addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                            mChunks.get(patternPos + rightSkip + 1));
                }
            }
        }
    }

    // get iterator
    mChunkPairsIter = mChunkPairs.iterator();
}

From source file:edu.isi.mavuno.extract.ChunkExtractor.java

License:Apache License

private void addPair(Text left, Text pattern, Text right) {
    ContextPatternWritable c;

    // forward pattern
    c = new ContextPatternWritable();
    c.setContext(MavunoUtils.createContext(left, right));
    c.setPattern(pattern);

    // add to chunk pairs
    mChunkPairs.add(c);

    // reverse pattern
    c = new ContextPatternWritable();
    c.setContext(MavunoUtils.createContext(right, left));
    c.setPattern(REVERSE_PATTERN);
    c.getPattern().append(pattern.getBytes(), 0, pattern.getLength());

    // add to chunk pairs
    mChunkPairs.add(c);
}

From source file:edu.isi.mavuno.extract.ChunkExtractor.java

License:Apache License

private Text createChunk(List<Text> terms, Text type) {
    Text t = new Text();

    for (int i = 0; i < terms.size(); i++) {
        Text term = terms.get(i);

        t.append(term.getBytes(), 0, term.getLength());

        if (i != terms.size() - 1) {
            t.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
    }

    if (t.getLength() > 0 && !mSurfaceForms) {
        t.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        if (type.getLength() > 2) {
            t.append(type.getBytes(), 2, type.getLength() - 2);
        }
    }

    return t;
}

From source file:edu.isi.mavuno.extract.DIRTExtractor.java

License:Apache License

private void loadDependPairs() {
    // clear dependency pairs
    mDependPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // get sentence tokens
    List<TratzParsedTokenWritable> tokens = sentence.getTokens();

    // get chunk ids
    int[] chunkIds = NLProcTools.getChunkIds(tokens);

    // get mapping from positions to chunks
    Text[] chunks = new Text[tokens.size()];

    Text curChunk = null;
    for (int i = 0; i < tokens.size(); i++) {
        Text text = tokens.get(i).getToken();

        if (i == 0 || (i > 0 && chunkIds[i] != chunkIds[i - 1])) {
            curChunk = new Text(text);
        } else {
            curChunk.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
            curChunk.append(text.getBytes(), 0, text.getLength());
        }

        chunks[i] = curChunk;
    }

    // populate parse tree
    ArrayListOfInts[] children = new ArrayListOfInts[tokens.size() + 1];
    for (int i = 0; i < tokens.size() + 1; i++) {
        children[i] = new ArrayListOfInts();
    }

    for (int i = 0; i < tokens.size(); i++) {
        TratzParsedTokenWritable t = tokens.get(i);

        // ignore punctuation
        if (!t.getDependType().equals(PUNCTUATION_TYPE)) {
            children[t.getDependIndex()].add(i + 1);
        }
    }

    // extract (context, pattern) pairs from parse tree
    for (int i = 0; i < children[0].size(); i++) {
        extractPairs(children, children[0].get(i), tokens, chunks);
    }

    // get iterator
    mDependPairsIter = mDependPairs.iterator();
}