Example usage for org.apache.hadoop.io.Text.clear()

List of usage examples for org.apache.hadoop.io Text clear

Introduction

On this page you can find usage examples for the method org.apache.hadoop.io.Text.clear().

Prototype

public void clear() 

Source Link

Document

Clear the string to empty.

Usage

From source file:com.ricemap.spateDB.operations.Sampler.java

License:Apache License

/**
 * Creates a proxy ResultCollector that takes as input objects of type T
 * and converts them to objects of type O.
 * It returns an object with a collect method that takes as input an object
 * of type T (i.e., inObj). This object converts the given object to the
 * type O (i.e., outObj) and sends the result to the method in output#collect.
 * @param <O>/*  w w w  .j a v  a 2  s  .  c o m*/
 * @param <T>
 * @param output
 * @param inObj
 * @param outObj
 * @return
 */
private static <O extends TextSerializable, T extends TextSerializable> ResultCollector<T> createConverter(
        final ResultCollector<O> output, T inObj, final O outObj) {
    if (output == null)
        return null;
    if (inObj.getClass() == outObj.getClass()) {
        return new ResultCollector<T>() {
            @Override
            public void collect(T r) {
                output.collect((O) r);
            }
        };
    } else if (inObj instanceof Shape && outObj instanceof Point3d) {
        final Point3d out_pt = (Point3d) outObj;
        return new ResultCollector<T>() {
            @Override
            public void collect(T r) {
                Point3d pt = ((Shape) r).getMBR().getCenterPoint();
                out_pt.x = pt.x;
                out_pt.y = pt.y;
                output.collect(outObj);
            }
        };
    } else if (inObj instanceof Shape && outObj instanceof Prism) {
        final Prism out_rect = (Prism) outObj;
        return new ResultCollector<T>() {
            @Override
            public void collect(T r) {
                out_rect.set((Shape) r);
                output.collect(outObj);
            }
        };
    } else if (outObj instanceof Text) {
        final Text text = (Text) outObj;
        return new ResultCollector<T>() {
            @Override
            public void collect(T r) {
                text.clear();
                r.toText(text);
                output.collect(outObj);
            }
        };
    } else if (inObj instanceof Text) {
        final Text text = (Text) inObj;
        return new ResultCollector<T>() {
            @Override
            public void collect(T r) {
                outObj.fromText(text);
                output.collect(outObj);
            }
        };
    } else {
        throw new RuntimeException("Cannot convert from " + inObj.getClass() + " to " + outObj.getClass());
    }
}

From source file:com.ricemap.spateDB.operations.Tail.java

License:Apache License

/**
 * Reads a maximum of n lines from the stream starting from its current
 * position and going backward./* w w w .j a va2 s  .c  om*/
 * 
 * @param in - An input stream. It'll be scanned from its current position
 *   backward till position 0
 * @param n - Maximum number of lines to return
 * @param stockObject - An object used to deserialize lines read. It can
 *   be set to <code>null</code> if output is also <code>null</code>. In this
 *   case, nothing is reported to the output.
 * @param output - An output collector used to report lines read.
 * @return - The position of the beginning of the earliest line read from
 *   buffer.
 * @throws IOException
 */
public static <T extends TextSerializable> long tail(FSDataInputStream in, int n, T stockObject,
        ResultCollector<T> output) throws IOException {
    int lines_read = 0;
    long end = in.getPos();
    long offset_of_last_eol = end;
    long last_read_byte = end;

    LongWritable line_offset = new LongWritable();
    Text read_line = new Text();
    Text remainder_from_last_buffer = new Text();
    byte[] buffer = new byte[4096];

    while (last_read_byte > 0 && lines_read < n) {
        // Read next chunk from the back
        long first_byte_to_read = (last_read_byte - 1) - (last_read_byte - 1) % buffer.length;
        in.seek(first_byte_to_read);
        int bytes_to_read = (int) (last_read_byte - first_byte_to_read);
        in.read(buffer, 0, bytes_to_read);
        last_read_byte = first_byte_to_read;

        // Iterate over bytes in this buffer
        int i_last_byte_consumed_in_buffer = bytes_to_read;
        int i_last_byte_examined_in_buffer = bytes_to_read;
        while (i_last_byte_examined_in_buffer > 0 && lines_read < n) {
            byte byte_examined = buffer[--i_last_byte_examined_in_buffer];
            if (byte_examined == '\n' || byte_examined == '\r') {
                // Found an end of line character
                // Report this to output unless it's empty
                long offset_of_this_eol = first_byte_to_read + i_last_byte_examined_in_buffer;
                if (offset_of_last_eol - offset_of_this_eol > 1) {
                    if (output != null) {
                        read_line.clear();
                        // +1 is to skip the EOL at the beginning
                        read_line.append(buffer, i_last_byte_examined_in_buffer + 1,
                                i_last_byte_consumed_in_buffer - (i_last_byte_examined_in_buffer + 1));
                        // Also append bytes remaining from last buffer
                        if (remainder_from_last_buffer.getLength() > 0) {
                            read_line.append(remainder_from_last_buffer.getBytes(), 0,
                                    remainder_from_last_buffer.getLength());
                        }
                        line_offset.set(offset_of_this_eol + 1);
                        stockObject.fromText(read_line);
                        output.collect(stockObject);
                    }
                    lines_read++;
                    remainder_from_last_buffer.clear();
                }
                i_last_byte_consumed_in_buffer = i_last_byte_examined_in_buffer;
                offset_of_last_eol = offset_of_this_eol;
            }
        }
        if (i_last_byte_consumed_in_buffer > 0) {
            // There are still some bytes not consumed in buffer
            if (remainder_from_last_buffer.getLength() == 0) {
                // Store whatever is remaining in remainder_from_last_buffer
                remainder_from_last_buffer.append(buffer, 0, i_last_byte_consumed_in_buffer);
            } else {
                // Prepend remaining bytes to Text
                Text t = new Text();
                t.append(buffer, 0, i_last_byte_consumed_in_buffer);
                t.append(remainder_from_last_buffer.getBytes(), 0, remainder_from_last_buffer.getLength());
                remainder_from_last_buffer = t;
            }
        }
    }

    if (lines_read < n && remainder_from_last_buffer.getLength() > 0) {
        // There is still one last line needs to be reported
        lines_read++;
        if (output != null) {
            read_line = remainder_from_last_buffer;
            line_offset.set(0);
            stockObject.fromText(read_line);
            output.collect(stockObject);
        }
        offset_of_last_eol = -1;
    }

    return offset_of_last_eol + 1;
}

From source file:com.tgam.hadoop.util.GenericEscapedLineReader.java

License:Apache License

/**
 * Read one line from the InputStream into the given Text.  A line
 * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
 * or '\r\n' (CR+LF).  EOF also terminates an otherwise unterminated
 * line.  An LF or CR byte that directly follows the ESCAPE byte is not
 * treated as a terminator; the ESCAPE byte itself is copied into str
 * unchanged.
 *
 * @param str the object to store the given line (without newline)
 * @param maxLineLength the maximum number of bytes to store into str;
 *  the rest of the line is silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume
 *  in this call.  This is only a hint, because if the line cross
 *  this threshold, we allow it to happen.  It can overshoot
 *  potentially by as much as one buffer length.
 *
 * @return the number of bytes read including the (longest) newline
 * found.
 *
 * @throws IOException if the underlying stream throws, or if more than
 *  Integer.MAX_VALUE bytes were consumed before a newline was found
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
    * already buffered in buffer, so we have several cases:
    * 1. No newline characters are in the buffer, so we need to copy
    *    everything and read another buffer from the stream.
    * 2. An unambiguously terminated line is in buffer, so we just
    *    copy to str.
    * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
    *    in CR.  In this case we copy everything up to CR to str, but
    *    we also need to see what follows CR: if it's LF, then we
    *    need consume LF as well, so next call to readLine will read
    *    from after that.
    * We use a flag prevCharCR to signal if previous character was CR
    * and, if it happens to be at the end of the buffer, delay
    * consuming it until we have a chance to look at the char that
    * follows.
    * The flag prevCharEscape plays the same role for the ESCAPE byte;
    * both flags are locals of this call, so they survive buffer refills
    * within the call.
    */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    int newlineLength = 0; // length of terminating newline; 0 while none was found
    boolean prevCharCR = false; // true if prev char was an unescaped CR
    boolean prevCharEscape = false; // true if prev char was the ESCAPE byte
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; // starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            // buffer exhausted: refill it from the stream
            startPosn = bufferPosn = 0;
            if (prevCharCR)
                ++bytesConsumed; // account for CR from previous read
            bufferLength = in.read(buffer);
            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
            if (buffer[bufferPosn] == LF && !prevCharEscape) {
                // unescaped LF: terminator is CR+LF (2 bytes) or LF alone (1 byte)
                newlineLength = (prevCharCR) ? 2 : 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
            if (prevCharCR) { // CR + notLF, we are at notLF
                newlineLength = 1;
                break;
            }

            prevCharCR = (buffer[bufferPosn] == CR && !prevCharEscape);
            // NOTE(review): set for every ESCAPE byte, even one that was itself
            // preceded by ESCAPE - confirm doubled escapes are not expected
            prevCharEscape = (buffer[bufferPosn] == ESCAPE);
        }
        int readLength = bufferPosn - startPosn;
        if (prevCharCR && newlineLength == 0)
            --readLength; // CR at the end of the buffer
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        // never store more than maxLineLength bytes in total
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    // the return type is int, so a larger count cannot be reported
    if (bytesConsumed > (long) Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    return (int) bytesConsumed;
}

From source file:com.thinkbiganalytics.inputformat.hadoop.mapred.EscapedLineReader.java

License:Open Source License

/**
 * Read one line from the InputStream into the given Text. A line
 * can be terminated by one of the following: '\n' (LF), '\r' (CR),
 * or '\r\n' (CR+LF).  Will ignore any of these termination characters
 * if they are preceded by a designated escape character. EOF also
 * terminates an otherwise unterminated line.
 *
 * @param str               the object to store the given line (without the newline)
 * @param maxLineLength     the maximum number of bytes to store into str; the rest will be silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume in this call.  This is only a hint, because if the line crosses this threshold, we allow it to happen.  It can overshoot
 *                          potentially by as much as one buffer length.
 * @return the number of bytes read including the (longest) newline found
 * @throws IOException if the underlying stream throws, or if more than Integer.MAX_VALUE bytes were consumed before a newline was found
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR.  In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need consume LF as well, so next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength() as an optimization
    int newLineLength = 0; // length of the terminating newline; 0 while none was found
    boolean prevCharCR = false; // true if prev char was \r
    long bytesConsumed = 0;

    do {
        int startPos = bufferPos; // starting from where we left off
        if (bufferPos >= bufferLength) {
            // buffer exhausted: refill it from the stream
            startPos = bufferPos = 0;
            if (prevCharCR) {
                ++bytesConsumed; // account for CR from previous read
            }
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        for (; bufferPos < bufferLength; ++bufferPos) {
            // Escape detection looks backwards inside the current buffer only.
            // NOTE(review): an escape byte that was the last byte of the previous
            // buffer is not seen after a refill (bufferPos restarts at 0) - confirm
            // whether escapes spanning a buffer boundary need to be handled.
            boolean escaped = false;
            if (prevCharCR && bufferPos > 1) {
                // previous byte is the CR, so test the byte in front of that CR
                escaped = (buffer[bufferPos - 2] == escapeChar);
            }
            if (!prevCharCR && bufferPos > 0) {
                // test the byte directly in front of the current one
                escaped = (buffer[bufferPos - 1] == escapeChar);
            }

            if (buffer[bufferPos] == LF && !escaped) {
                // terminator is CR+LF (2 bytes) or LF alone (1 byte)
                newLineLength = prevCharCR ? 2 : 1;
                ++bufferPos; // at next loop proceed from following byte
                break;
            }
            if (prevCharCR && !escaped) { // CR + notLF, we are at notLF
                newLineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPos] == CR);
            // NOTE(review): a variant that also required !escaped here was left
            // disabled by the original author:
            //prevCharCR = (buffer[bufferPos] == CR && !escaped);
        }
        int readLength = bufferPos - startPos;
        if (prevCharCR && newLineLength == 0) {
            // CR at the end of the buffer: counted on the next refill instead
            --readLength;
        }
        bytesConsumed += readLength;
        int appendLength = readLength - newLineLength;
        // never store more than maxLineLength bytes in total
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPos, appendLength);
            txtLength += appendLength;
        }
    } while (newLineLength == 0 && bytesConsumed < maxBytesToConsume);

    // the return type is int, so a larger count cannot be reported
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }

    return (int) bytesConsumed;
}

From source file:cosmos.mapred.LfLineReader.java

License:Apache License

/**
 * Read one line from the InputStream into the given Text. A line can be terminated by '\n' (LF). EOF also terminates an otherwise unterminated line.
 * /*from w w  w.  j  av a2s  .c  o  m*/
 * @param str
 *          the object to store the given line (without newline)
 * @param maxLineLength
 *          the maximum number of bytes to store into str; the rest of the line is silently discarded.
 * @param maxBytesToConsume
 *          the maximum number of bytes to consume in this call. This is only a hint, because if the line cross this threshold, we allow it to happen. It can
 *          overshoot potentially by as much as one buffer length.
 * 
 * @return the number of bytes read including the (longest) newline found.
 * 
 * @throws IOException
 *           if the underlying stream throws
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /*
     * We're reading data from in, but the head of the stream may be already buffered in buffer, so we have several cases: 1. No newline characters are in the
     * buffer, so we need to copy everything and read another buffer from the stream. 2. An unambiguously terminated line is in buffer, so we just copy to str.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    int newlineLength = 0; // length of terminating newline
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; // starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
            if (buffer[bufferPosn] == LF) {
                newlineLength = 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
        }
        int readLength = bufferPosn - startPosn;
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    if (bytesConsumed > Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    return (int) bytesConsumed;
}

From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java

License:Apache License

/**
 * Reads the lines of the given input split and stores each of them, as a
 * string, in {@code paths}.
 * The code in this method is redistributed from Hadoop LineRecordReader.
 * <p>
 * Reading stops at the first read that consumes no bytes (end of stream) or
 * when the split boundary check of the loop fails. The line reader, the file
 * stream and the pooled decompressor are released even if reading fails.
 *
 * @param split the split to read; it is cast to {@code FileSplit}
 * @param conf the configuration used to look up the compression codec and
 *   to create the line reader
 * @throws IOException if opening, reading or closing the file fails
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();

    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in = null;
    Decompressor decompressor = null;

    try {
        Seekable filePosition;
        boolean compressed = false;
        if (null != codec) {
            compressed = true;
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fis,
                        decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
                begin = cIn.getAdjustedStart();
                end = cIn.getAdjustedEnd();
                filePosition = cIn;
            } else {
                in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
                filePosition = fis;
            }
        } else {
            fis.seek(begin);
            in = new SplitLineReader(fis, conf, (byte[]) null);
            filePosition = fis;
        }
        // If this is not the first split, we always throw away first record
        // because we always (except the last split) read one extra line in
        // the previous split.
        if (begin != 0) {
            begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
        }
        long pos = begin;

        final Text nextLine = new Text();
        paths = new ArrayList<>();
        while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {
            final boolean atStreamStart = (pos == 0);
            final int bytesRead = in.readLine(nextLine, Integer.MAX_VALUE,
                    atStreamStart ? Integer.MAX_VALUE : maxBytesToConsume(compressed, pos, end));
            if (bytesRead == 0) {
                // End of stream. Without this check the loop never terminates
                // when the split ends at the end of the file, because the
                // position stays <= end and empty lines keep being added.
                break;
            }
            pos += bytesRead;

            if (atStreamStart) {
                // Strip BOM(Byte Order Mark)
                // Text only support UTF-8, we only need to check UTF-8 BOM
                // (0xEF,0xBB,0xBF) at the start of the text stream.
                int textLength = nextLine.getLength();
                byte[] textBytes = nextLine.getBytes();
                if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                        && (textBytes[2] == (byte) 0xBF)) {
                    // find UTF-8 BOM, strip it.
                    LOG.info("Found UTF-8 BOM and skipped it");
                    textLength -= 3;
                    if (textLength > 0) {
                        // It may work to use the same buffer and
                        // not do the copyBytes
                        textBytes = nextLine.copyBytes();
                        nextLine.set(textBytes, 3, textLength);
                    } else {
                        nextLine.clear();
                    }
                }
            }

            paths.add(nextLine.toString());
        }
    } finally {
        // Release everything even when reading failed part-way through
        try {
            if (in != null) {
                in.close();
            }
        } finally {
            try {
                fis.close();
            } finally {
                if (decompressor != null) {
                    CodecPool.returnDecompressor(decompressor);
                }
            }
        }
    }
}

From source file:edu.isi.mavuno.extract.CooccurExtractor.java

License:Apache License

/**
 * Joins {@code len} consecutive terms, starting at index {@code start},
 * into {@code pattern}, separated by MavunoUtils.SPACE_BYTES.
 *
 * @param pattern the text that is cleared and then filled with the joined terms
 * @param terms the terms to take the window from
 * @param start index of the first term of the window
 * @param len number of terms in the window
 * @return {@code false}, leaving {@code pattern} untouched, if the window
 *   starts before index 0 or ends past the end of {@code terms};
 *   {@code true} otherwise
 */
protected boolean getPattern(Text pattern, Text[] terms, int start, int len) {
    final int end = start + len;
    if (start < 0 || end > terms.length) {
        return false;
    }

    pattern.clear();
    for (int pos = start; pos < end; pos++) {
        // a separator goes in front of every term except the first one
        if (pos > start) {
            pattern.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
        final Text term = terms[pos];
        pattern.append(term.getBytes(), 0, term.getLength());
    }

    return true;
}

From source file:edu.isi.mavuno.extract.NAryChunkExtractor.java

License:Apache License

/**
 * Builds the (context, pattern) pairs for the next sentence taken from
 * mSentIter and leaves an iterator over them in mChunkPairsIter.
 *
 * The sentence tokens are grouped into chunks by their chunk ids, then
 * combinations of mArity context chunk positions and mArity - 1 pattern
 * chunk positions are enumerated; for each combination one pair is
 * generated per permutation delivered by mPermGen.
 */
private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // get chunk ids
    List<TratzParsedTokenWritable> sentenceTokens = sentence.getTokens();
    int[] chunkIds = NLProcTools.getChunkIds(sentenceTokens);

    mChunks.clear();
    mChunkTokens.clear();

    // extract chunks from sentence: a change of chunk id closes the current chunk
    for (int i = 0; i < chunkIds.length; i++) {
        if (i > 0 && chunkIds[i] != chunkIds[i - 1]) {
            Chunk chunk = createChunk(mChunkTokens);
            mChunks.add(chunk);
            mChunkTokens.clear();
        }
        mChunkTokens.add(sentenceTokens.get(i));
    }

    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) {
        Chunk chunk = createChunk(mChunkTokens);
        mChunks.add(chunk);
    }

    // there's nothing we can do if there aren't at least mArity chunks in the sentence
    if (mArity > mChunks.size()) {
        // iterator over the (just cleared) list of pairs
        mChunkPairsIter = mChunkPairs.iterator();
        return;
    }

    // initialize context positions
    for (int i = 0; i < mArity; i++) {
        mContextPositions[i] = i;
    }

    // initialize pattern positions
    for (int i = 0; i < mArity - 1; i++) {
        mPatternPositions[i] = i;
    }

    // generate (context, pattern) pairs based on chunks
    // basePattern is reused (cleared) for every combination of positions
    final Text basePattern = new Text();
    while (true) {
        // construct context
        for (int i = 0; i < mArity; i++) {
            mContextChunks[i] = mChunks.get(mContextPositions[i]);
        }

        // construct pattern
        for (int i = 0; i < mArity - 1; i++) {
            mPatternChunks[i] = mChunks.get(mPatternPositions[i]);
        }

        // add (context, pattern) pair
        basePattern.clear();
        for (int i = 0; i < mArity - 1; i++) {
            // left chunk type
            basePattern.append(mContextChunks[i].type.getBytes(), 0, mContextChunks[i].type.getLength());
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

            // the pattern chunk is more than one position away from at least one
            // of its two neighbouring context chunks: mark the gaps with asterisks
            if (mContextPositions[i + 1] - mPatternPositions[i] > 1
                    || mPatternPositions[i] - mContextPositions[i] > 1) {
                if (mPatternPositions[i] == mContextPositions[i]) {
                    // pattern position coincides with the left context chunk
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] == mContextPositions[i] + 1) {
                    // pattern chunk directly follows the left context chunk
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0,
                            mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] + 1 == mContextPositions[i + 1]) {
                    // pattern chunk directly precedes the right context chunk
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0,
                            mPatternChunks[i].text.getLength());
                } else {
                    // pattern chunk touches neither context chunk
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0,
                            mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                }
            } else if (mPatternPositions[i] == mContextPositions[i]) {
                basePattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
            } else {
                basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
            }
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        }

        // last chunk type
        basePattern.append(mContextChunks[mArity - 1].type.getBytes(), 0,
                mContextChunks[mArity - 1].type.getLength());
        basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

        int[] indices;
        mPermGen.reset();
        while (mPermGen.hasMore()) {
            // get next permutation
            indices = mPermGen.getNext();

            ContextPatternWritable c = new ContextPatternWritable();

            // pattern: base pattern followed by "<permutations left>/<arity>"
            c.setPattern(basePattern);
            Text numLeftText = new Text(mPermGen.getNumLeft() + "/" + mArity);
            c.getPattern().append(numLeftText.getBytes(), 0, numLeftText.getLength());

            // context: texts of the context chunks in permuted order, pipe-separated
            c.getContext().clear();
            for (int i = 0; i < mArity; i++) {
                c.getContext().append(mContextChunks[indices[i]].text.getBytes(), 0,
                        mContextChunks[indices[i]].text.getLength());
                if (i != mArity - 1) {
                    c.getContext().append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                }
            }

            // add to chunk pairs
            mChunkPairs.add(c);
        }

        // get next set of context and pattern positions:
        // advance the right-most pattern position that can still move
        int pos = mArity - 2;
        while (pos >= 0) {
            if (mPatternPositions[pos] + 1 < mChunks.size()
                    && mPatternPositions[pos] + 1 < mContextPositions[pos + 1]) {
                mPatternPositions[pos]++;
                // NOTE(review): this loop stops at index mArity - 3, while the other
                // pattern-position loops in this method run up to mArity - 2 -
                // confirm the last pattern position is meant to stay untouched here
                for (int i = pos + 1; i < mArity - 2; i++) {
                    mPatternPositions[i] = mContextPositions[i];
                }
                break;
            }
            pos--;
        }

        // update the context positions if the pattern positions can't be updated any further
        if (pos < 0) {
            pos = mArity - 1;
            while (pos >= 0) {
                // a context position may advance if it stays inside the sentence,
                // stays left of the next context position, and its current gap to
                // the previous context position is at most mMaxSkipSize
                if (mContextPositions[pos] + 1 < mChunks.size()
                        && (pos + 1 >= mArity || mContextPositions[pos + 1] - (mContextPositions[pos] + 1) >= 1)
                        && (pos <= 0
                                || mContextPositions[pos] - mContextPositions[pos - 1] - 1 <= mMaxSkipSize)) {
                    mContextPositions[pos]++;
                    if (pos < mArity - 1) {
                        mPatternPositions[pos] = mContextPositions[pos];
                    }

                    // pack all positions to the right directly behind the advanced one
                    for (int i = pos + 1; i < mArity; i++) {
                        mContextPositions[i] = mContextPositions[pos] + (i - pos);
                        if (i < mArity - 1) {
                            mPatternPositions[i] = mContextPositions[i];
                        }
                    }

                    break;
                }
                pos--;
            }

            // if neither the context nor the pattern positions can be updated then we're done
            if (pos < 0) {
                // get iterator
                mChunkPairsIter = mChunkPairs.iterator();
                return;
            }
        }
    }
}

From source file:edu.isi.mavuno.extract.NGramExtractor.java

License:Apache License

/**
 * Writes the terms {@code terms[start]} .. {@code terms[start + len - 1]}
 * into {@code span}, with MavunoUtils.SPACE_BYTES between neighbours.
 *
 * @param span the text that is cleared and then receives the joined terms
 * @param terms the source terms
 * @param start index of the first term to include
 * @param len number of terms to include
 * @return {@code true} if the requested range lies inside {@code terms};
 *   {@code false} otherwise, in which case {@code span} is not modified
 */
private boolean getSpan(Text span, Text[] terms, int start, int len) {
    final boolean insideArray = !(start < 0 || start + len > terms.length);
    if (!insideArray) {
        return false;
    }

    span.clear();
    final int lastIndex = start + len - 1;
    int index = start;
    while (index <= lastIndex) {
        final Text current = terms[index];
        span.append(current.getBytes(), 0, current.getLength());

        // every term but the last one is followed by a separator
        if (index < lastIndex) {
            span.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
        index++;
    }

    return true;
}

From source file:edu.isi.mavuno.util.TokenWritable.java

License:Apache License

/**
 * Copies {@code s} into {@code t}, treating a {@code null} source as empty.
 *
 * @param t the text to overwrite
 * @param s the text to copy from; if {@code null}, {@code t} is cleared instead
 */
protected static void safeSet(Text t, Text s) {
    if (s != null) {
        t.set(s);
        return;
    }
    // no source value: leave the target empty
    t.clear();
}