List of usage examples for org.apache.hadoop.io.Text#clear()

public void clear()
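All of the examples below follow the same reuse pattern: call clear() to reset a Text buffer, then refill it with append() or set(). A minimal, self-contained sketch of that pattern (the class name TextClearExample and the sample strings are illustrative, not taken from any of the projects below); note that clear() only resets the logical length, so reads should be bounded by getLength() rather than by the size of the array returned by getBytes():

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class TextClearExample {
    public static void main(String[] args) {
        // One reusable buffer instead of allocating a new Text per record.
        Text buffer = new Text();

        for (String record : new String[] { "first", "second record" }) {
            buffer.clear(); // reset the logical length to 0
            byte[] utf8 = record.getBytes(StandardCharsets.UTF_8);
            buffer.append(utf8, 0, utf8.length); // refill the same buffer
            System.out.println(buffer.getLength() + " bytes: " + buffer);
        }

        // clear() does not shrink or zero the backing array, so only the first
        // getLength() bytes of getBytes() are valid after a clear()/append() cycle.
        buffer.clear();
        System.out.println("length after clear(): " + buffer.getLength()); // prints 0
    }
}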
From source file: com.ricemap.spateDB.operations.Sampler.java
License: Apache License

/**
 * Creates a proxy ResultCollector that takes as input objects of type T
 * and converts them to objects of type O.
 * It returns an object with a collect method that takes as input an object
 * of type T (i.e., inObj). This object converts the given object to the
 * type O (i.e., outObj) and sends the result to the method in output#collect.
 * @param <O>
 * @param <T>
 * @param output
 * @param inObj
 * @param outObj
 * @return
 */
private static <O extends TextSerializable, T extends TextSerializable> ResultCollector<T> createConverter(
        final ResultCollector<O> output, T inObj, final O outObj) {
    if (output == null)
        return null;
    if (inObj.getClass() == outObj.getClass()) {
        return new ResultCollector<T>() {
            @Override
            public void collect(T r) {
                output.collect((O) r);
            }
        };
    } else if (inObj instanceof Shape && outObj instanceof Point3d) {
        final Point3d out_pt = (Point3d) outObj;
        return new ResultCollector<T>() {
            @Override
            public void collect(T r) {
                Point3d pt = ((Shape) r).getMBR().getCenterPoint();
                out_pt.x = pt.x;
                out_pt.y = pt.y;
                output.collect(outObj);
            }
        };
    } else if (inObj instanceof Shape && outObj instanceof Prism) {
        final Prism out_rect = (Prism) outObj;
        return new ResultCollector<T>() {
            @Override
            public void collect(T r) {
                out_rect.set((Shape) r);
                output.collect(outObj);
            }
        };
    } else if (outObj instanceof Text) {
        final Text text = (Text) outObj;
        return new ResultCollector<T>() {
            @Override
            public void collect(T r) {
                text.clear();
                r.toText(text);
                output.collect(outObj);
            }
        };
    } else if (inObj instanceof Text) {
        final Text text = (Text) inObj;
        return new ResultCollector<T>() {
            @Override
            public void collect(T r) {
                outObj.fromText(text);
                output.collect(outObj);
            }
        };
    } else {
        throw new RuntimeException(
                "Cannot convert from " + inObj.getClass() + " to " + outObj.getClass());
    }
}
From source file: com.ricemap.spateDB.operations.Tail.java
License: Apache License

/**
 * Reads a maximum of n lines from the stream starting from its current
 * position and going backward.
 *
 * @param in - An input stream. It'll be scanned from its current position
 *   backward till position 0
 * @param n - Maximum number of lines to return
 * @param stockObject - An object used to deserialize lines read. It can
 *   be set to <code>null</code> if output is also <code>null</code>. In this
 *   case, nothing is reported to the output.
 * @param output - An output collector used to report lines read.
 * @return - The position of the beginning of the earliest line read from
 *   buffer.
 * @throws IOException
 */
public static <T extends TextSerializable> long tail(FSDataInputStream in, int n, T stockObject,
        ResultCollector<T> output) throws IOException {
    int lines_read = 0;
    long end = in.getPos();
    long offset_of_last_eol = end;
    long last_read_byte = end;

    LongWritable line_offset = new LongWritable();
    Text read_line = new Text();
    Text remainder_from_last_buffer = new Text();
    byte[] buffer = new byte[4096];

    while (last_read_byte > 0 && lines_read < n) {
        // Read next chunk from the back
        long first_byte_to_read = (last_read_byte - 1) - (last_read_byte - 1) % buffer.length;
        in.seek(first_byte_to_read);
        int bytes_to_read = (int) (last_read_byte - first_byte_to_read);
        in.read(buffer, 0, bytes_to_read);
        last_read_byte = first_byte_to_read;

        // Iterate over bytes in this buffer
        int i_last_byte_consumed_in_buffer = bytes_to_read;
        int i_last_byte_examined_in_buffer = bytes_to_read;
        while (i_last_byte_examined_in_buffer > 0 && lines_read < n) {
            byte byte_examined = buffer[--i_last_byte_examined_in_buffer];
            if (byte_examined == '\n' || byte_examined == '\r') {
                // Found an end of line character
                // Report this to output unless it's empty
                long offset_of_this_eol = first_byte_to_read + i_last_byte_examined_in_buffer;
                if (offset_of_last_eol - offset_of_this_eol > 1) {
                    if (output != null) {
                        read_line.clear();
                        // +1 is to skip the EOL at the beginning
                        read_line.append(buffer, i_last_byte_examined_in_buffer + 1,
                                i_last_byte_consumed_in_buffer - (i_last_byte_examined_in_buffer + 1));
                        // Also append bytes remaining from last buffer
                        if (remainder_from_last_buffer.getLength() > 0) {
                            read_line.append(remainder_from_last_buffer.getBytes(), 0,
                                    remainder_from_last_buffer.getLength());
                        }
                        line_offset.set(offset_of_this_eol + 1);
                        stockObject.fromText(read_line);
                        output.collect(stockObject);
                    }
                    lines_read++;
                    remainder_from_last_buffer.clear();
                }
                i_last_byte_consumed_in_buffer = i_last_byte_examined_in_buffer;
                offset_of_last_eol = offset_of_this_eol;
            }
        }
        if (i_last_byte_consumed_in_buffer > 0) {
            // There are still some bytes not consumed in buffer
            if (remainder_from_last_buffer.getLength() == 0) {
                // Store whatever is remaining in remainder_from_last_buffer
                remainder_from_last_buffer.append(buffer, 0, i_last_byte_consumed_in_buffer);
            } else {
                // Prepend remaining bytes to Text
                Text t = new Text();
                t.append(buffer, 0, i_last_byte_consumed_in_buffer);
                t.append(remainder_from_last_buffer.getBytes(), 0, remainder_from_last_buffer.getLength());
                remainder_from_last_buffer = t;
            }
        }
    }

    if (lines_read < n && remainder_from_last_buffer.getLength() > 0) {
        // There is still one last line that needs to be reported
        lines_read++;
        if (output != null) {
            read_line = remainder_from_last_buffer;
            line_offset.set(0);
            stockObject.fromText(read_line);
            output.collect(stockObject);
        }
        offset_of_last_eol = -1;
    }

    return offset_of_last_eol + 1;
}
From source file: com.tgam.hadoop.util.GenericEscapedLineReader.java
License: Apache License

/**
 * Read one line from the InputStream into the given Text. A line
 * can be terminated by one of the following: '\n' (LF), '\r' (CR),
 * or '\r\n' (CR+LF). EOF also terminates an otherwise unterminated
 * line.
 *
 * @param str the object to store the given line (without newline)
 * @param maxLineLength the maximum number of bytes to store into str;
 *   the rest of the line is silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume
 *   in this call. This is only a hint, because if the line crosses
 *   this threshold, we allow it to happen. It can overshoot
 *   potentially by as much as one buffer length.
 *
 * @return the number of bytes read including the (longest) newline
 *   found.
 *
 * @throws IOException if the underlying stream throws
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR. In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need to consume LF as well, so the next call to readLine will
     *    read from after that.
     * We use a flag prevCharCR to signal if the previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    int newlineLength = 0; // length of terminating newline
    boolean prevCharCR = false; // true if prev char was CR
    boolean prevCharEscape = false;
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; // starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            if (prevCharCR)
                ++bytesConsumed; // account for CR from previous read
            bufferLength = in.read(buffer);
            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
            if (buffer[bufferPosn] == LF && !prevCharEscape) {
                newlineLength = (prevCharCR) ? 2 : 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
            if (prevCharCR) { // CR + notLF, we are at notLF
                newlineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPosn] == CR && !prevCharEscape);
            prevCharEscape = (buffer[bufferPosn] == ESCAPE);
        }
        int readLength = bufferPosn - startPosn;
        if (prevCharCR && newlineLength == 0)
            --readLength; // CR at the end of the buffer
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    if (bytesConsumed > (long) Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    return (int) bytesConsumed;
}
From source file: com.thinkbiganalytics.inputformat.hadoop.mapred.EscapedLineReader.java
License: Open Source License

/**
 * Read one line from the InputStream into the given Text. A line
 * can be terminated by one of the following: '\n' (LF), '\r' (CR),
 * or '\r\n' (CR+LF). Will ignore any of these termination characters
 * if they are preceded by a designated escape character. EOF also
 * terminates an otherwise unterminated line.
 *
 * @param str the object to store the given line (without the newline)
 * @param maxLineLength the maximum number of bytes to store into str; the rest will be silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume in this call. This is only a hint,
 *   because if the line crosses this threshold, we allow it to happen. It can overshoot potentially by
 *   as much as one buffer length.
 * @return the number of bytes read including the (longest) newline found
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR. In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need to consume LF as well, so the next call to readLine will
     *    read from after that.
     * We use a flag prevCharCR to signal if the previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength() as an optimization
    int newLineLength = 0; // length of the terminating newline
    boolean prevCharCR = false; // true if prev char was \r
    long bytesConsumed = 0;
    do {
        int startPos = bufferPos; // starting from where we left off
        if (bufferPos >= bufferLength) {
            startPos = bufferPos = 0;
            if (prevCharCR) {
                ++bytesConsumed; // account for CR from previous read
            }
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        for (; bufferPos < bufferLength; ++bufferPos) {
            boolean escaped = false;
            if (prevCharCR && bufferPos > 1) {
                escaped = (buffer[bufferPos - 2] == escapeChar);
            }
            if (!prevCharCR && bufferPos > 0) {
                escaped = (buffer[bufferPos - 1] == escapeChar);
            }
            if (buffer[bufferPos] == LF && !escaped) {
                newLineLength = prevCharCR ? 2 : 1;
                ++bufferPos; // at next loop proceed from following byte
                break;
            }
            if (prevCharCR && !escaped) { // CR + notLF, we are at notLF
                newLineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPos] == CR);
            //prevCharCR = (buffer[bufferPos] == CR && !escaped);
        }
        int readLength = bufferPos - startPos;
        if (prevCharCR && newLineLength == 0) {
            --readLength;
        }
        bytesConsumed += readLength;
        int appendLength = readLength - newLineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPos, appendLength);
            txtLength += appendLength;
        }
    } while (newLineLength == 0 && bytesConsumed < maxBytesToConsume);

    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}
From source file: cosmos.mapred.LfLineReader.java
License: Apache License

/**
 * Read one line from the InputStream into the given Text. A line can be terminated by '\n' (LF).
 * EOF also terminates an otherwise unterminated line.
 *
 * @param str
 *   the object to store the given line (without newline)
 * @param maxLineLength
 *   the maximum number of bytes to store into str; the rest of the line is silently discarded.
 * @param maxBytesToConsume
 *   the maximum number of bytes to consume in this call. This is only a hint, because if the line
 *   crosses this threshold, we allow it to happen. It can overshoot potentially by as much as one
 *   buffer length.
 *
 * @return the number of bytes read including the (longest) newline found.
 *
 * @throws IOException
 *   if the underlying stream throws
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /*
     * We're reading data from in, but the head of the stream may be already buffered in buffer, so we
     * have several cases:
     * 1. No newline characters are in the buffer, so we need to copy everything and read another buffer
     *    from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just copy to str.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    int newlineLength = 0; // length of terminating newline
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; // starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
            if (buffer[bufferPosn] == LF) {
                newlineLength = 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
        }
        int readLength = bufferPosn - startPosn;
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    if (bytesConsumed > Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    return (int) bytesConsumed;
}
From source file: de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java
License: Apache License

/**
 * Reads the file paths (one per line) contained in the given input split into a list.
 * The code in this method is redistributed from Hadoop's LineRecordReader.
 *
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();

    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);

    SplitLineReader in;
    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fis,
                    decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;
    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {

        if (pos == 0) {
            // Strip BOM (Byte Order Mark).
            // Text only supports UTF-8, so we only need to check for the UTF-8 BOM
            // (0xEF,0xBB,0xBF) at the start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // found UTF-8 BOM, strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and
                    // not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }

        paths.add(nextLine.toString());
        LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
From source file: edu.isi.mavuno.extract.CooccurExtractor.java
License: Apache License

protected boolean getPattern(Text pattern, Text[] terms, int start, int len) {
    if (start < 0 || start + len > terms.length) {
        return false;
    }

    pattern.clear();

    for (int i = start; i < start + len; i++) {
        pattern.append(terms[i].getBytes(), 0, terms[i].getLength());
        if (i != start + len - 1) {
            pattern.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
    }

    return true;
}
From source file: edu.isi.mavuno.extract.NAryChunkExtractor.java
License: Apache License

private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // get chunk ids
    List<TratzParsedTokenWritable> sentenceTokens = sentence.getTokens();
    int[] chunkIds = NLProcTools.getChunkIds(sentenceTokens);

    mChunks.clear();
    mChunkTokens.clear();

    // extract chunks from sentence
    for (int i = 0; i < chunkIds.length; i++) {
        if (i > 0 && chunkIds[i] != chunkIds[i - 1]) {
            Chunk chunk = createChunk(mChunkTokens);
            mChunks.add(chunk);
            mChunkTokens.clear();
        }
        mChunkTokens.add(sentenceTokens.get(i));
    }

    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) {
        Chunk chunk = createChunk(mChunkTokens);
        mChunks.add(chunk);
    }

    // there's nothing we can do if there aren't at least mArity chunks in the sentence
    if (mArity > mChunks.size()) {
        mChunkPairsIter = mChunkPairs.iterator();
        return;
    }

    // initialize context positions
    for (int i = 0; i < mArity; i++) {
        mContextPositions[i] = i;
    }

    // initialize pattern positions
    for (int i = 0; i < mArity - 1; i++) {
        mPatternPositions[i] = i;
    }

    // generate (context, pattern) pairs based on chunks
    final Text basePattern = new Text();
    while (true) {
        // construct context
        for (int i = 0; i < mArity; i++) {
            mContextChunks[i] = mChunks.get(mContextPositions[i]);
        }

        // construct pattern
        for (int i = 0; i < mArity - 1; i++) {
            mPatternChunks[i] = mChunks.get(mPatternPositions[i]);
        }

        // add (context, pattern) pair
        basePattern.clear();
        for (int i = 0; i < mArity - 1; i++) {
            // left chunk type
            basePattern.append(mContextChunks[i].type.getBytes(), 0, mContextChunks[i].type.getLength());
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

            if (mContextPositions[i + 1] - mPatternPositions[i] > 1
                    || mPatternPositions[i] - mContextPositions[i] > 1) {
                if (mPatternPositions[i] == mContextPositions[i]) {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] == mContextPositions[i] + 1) {
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] + 1 == mContextPositions[i + 1]) {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                } else {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                }
            } else if (mPatternPositions[i] == mContextPositions[i]) {
                basePattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
            } else {
                basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
            }

            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        }

        // last chunk type
        basePattern.append(mContextChunks[mArity - 1].type.getBytes(), 0,
                mContextChunks[mArity - 1].type.getLength());
        basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

        int[] indices;
        mPermGen.reset();
        while (mPermGen.hasMore()) {
            // get next permutation
            indices = mPermGen.getNext();

            ContextPatternWritable c = new ContextPatternWritable();

            // pattern
            c.setPattern(basePattern);
            Text numLeftText = new Text(mPermGen.getNumLeft() + "/" + mArity);
            c.getPattern().append(numLeftText.getBytes(), 0, numLeftText.getLength());

            // context
            c.getContext().clear();
            for (int i = 0; i < mArity; i++) {
                c.getContext().append(mContextChunks[indices[i]].text.getBytes(), 0,
                        mContextChunks[indices[i]].text.getLength());
                if (i != mArity - 1) {
                    c.getContext().append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                }
            }

            // add to chunk pairs
            mChunkPairs.add(c);
        }

        // get next set of context and pattern positions
        int pos = mArity - 2;
        while (pos >= 0) {
            if (mPatternPositions[pos] + 1 < mChunks.size()
                    && mPatternPositions[pos] + 1 < mContextPositions[pos + 1]) {
                mPatternPositions[pos]++;
                for (int i = pos + 1; i < mArity - 2; i++) {
                    mPatternPositions[i] = mContextPositions[i];
                }
                break;
            }
            pos--;
        }

        // update the context positions if the pattern positions can't be updated any further
        if (pos < 0) {
            pos = mArity - 1;
            while (pos >= 0) {
                if (mContextPositions[pos] + 1 < mChunks.size()
                        && (pos + 1 >= mArity || mContextPositions[pos + 1] - (mContextPositions[pos] + 1) >= 1)
                        && (pos <= 0 || mContextPositions[pos] - mContextPositions[pos - 1] - 1 <= mMaxSkipSize)) {
                    mContextPositions[pos]++;

                    if (pos < mArity - 1) {
                        mPatternPositions[pos] = mContextPositions[pos];
                    }

                    for (int i = pos + 1; i < mArity; i++) {
                        mContextPositions[i] = mContextPositions[pos] + (i - pos);
                        if (i < mArity - 1) {
                            mPatternPositions[i] = mContextPositions[i];
                        }
                    }

                    break;
                }
                pos--;
            }

            // if neither the context nor the pattern positions can be updated then we're done
            if (pos < 0) {
                // get iterator
                mChunkPairsIter = mChunkPairs.iterator();
                return;
            }
        }
    }
}
From source file: edu.isi.mavuno.extract.NGramExtractor.java
License: Apache License

private boolean getSpan(Text span, Text[] terms, int start, int len) {
    if (start < 0 || start + len > terms.length) {
        return false;
    }

    span.clear();

    for (int i = start; i < start + len; i++) {
        span.append(terms[i].getBytes(), 0, terms[i].getLength());
        if (i != start + len - 1) {
            span.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
    }

    return true;
}
From source file: edu.isi.mavuno.util.TokenWritable.java
License: Apache License

protected static void safeSet(Text t, Text s) {
    if (s == null) {
        t.clear();
    } else {
        t.set(s);
    }
}