List of usage examples for org.apache.hadoop.io Text append
public void append(byte[] utf8, int start, int len)
From source file:org.apache.asterix.external.input.record.reader.hdfs.HDFSTextLineReader.java
License:Apache License
/** * Read one line from the InputStream into the given Text. A line * can be terminated by one of the following: '\n' (LF) , '\r' (CR), * or '\r\n' (CR+LF). EOF also terminates an otherwise unterminated * line.// w ww . j a va 2 s . co m * * @param str * the object to store the given line (without newline) * @param maxLineLength * the maximum number of bytes to store into str; * the rest of the line is silently discarded. * @param maxBytesToConsume * the maximum number of bytes to consume * in this call. This is only a hint, because if the line cross * this threshold, we allow it to happen. It can overshoot * potentially by as much as one buffer length. * @return the number of bytes read including the (longest) newline * found. * @throws IOException * if the underlying stream throws */ public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from in, but the head of the stream may be * already buffered in buffer, so we have several cases: * 1. No newline characters are in the buffer, so we need to copy * everything and read another buffer from the stream. * 2. An unambiguously terminated line is in buffer, so we just * copy to str. * 3. Ambiguously terminated line is in buffer, i.e. buffer ends * in CR. In this case we copy everything up to CR to str, but * we also need to see what follows CR: if it's LF, then we * need consume LF as well, so next call to readLine will read * from after that. * We use a flag prevCharCR to signal if previous character was CR * and, if it happens to be at the end of the buffer, delay * consuming it until we have a chance to look at the char that * follows. 
*/ str.clear(); int txtLength = 0; //tracks str.getLength(), as an optimization int newlineLength = 0; //length of terminating newline boolean prevCharCR = false; //true of prev char was CR long bytesConsumed = 0; do { int startPosn = bufferPosn; //starting from where we left off the last time if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; if (prevCharCR) ++bytesConsumed; //account for CR from previous read bufferLength = reader.read(buffer); if (bufferLength <= 0) break; // EOF } for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline if (buffer[bufferPosn] == LF) { newlineLength = (prevCharCR) ? 2 : 1; ++bufferPosn; // at next invocation proceed from following byte break; } if (prevCharCR) { //CR + notLF, we are at notLF newlineLength = 1; break; } prevCharCR = (buffer[bufferPosn] == CR); } int readLength = bufferPosn - startPosn; if (prevCharCR && newlineLength == 0) --readLength; //CR at the end of the buffer bytesConsumed += readLength; int appendLength = readLength - newlineLength; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { str.append(buffer, startPosn, appendLength); txtLength += appendLength; } } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); if (bytesConsumed > Integer.MAX_VALUE) throw new IOException("Too many bytes before newline: " + bytesConsumed); currentFilePos = reader.getPos() - bufferLength + bufferPosn; return (int) bytesConsumed; }
From source file:org.apache.ben.QuotationLineReader.java
License:Apache License
/** * Read a line terminated by one of CR, LF, or CRLF. *//*ww w. j a v a 2 s. c o m*/ private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from in, but the head of the stream may be * already buffered in buffer, so we have several cases: * 1. No newline characters are in the buffer, so we need to copy * everything and read another buffer from the stream. * 2. An unambiguously terminated line is in buffer, so we just * copy to str. * 3. Ambiguously terminated line is in buffer, i.e. buffer ends * in CR. In this case we copy everything up to CR to str, but * we also need to see what follows CR: if it's LF, then we * need consume LF as well, so next call to readLine will read * from after that. * We use a flag prevCharCR to signal if previous character was CR * and, if it happens to be at the end of the buffer, delay * consuming it until we have a chance to look at the char that * follows. */ str.clear(); int txtLength = 0; //tracks str.getLength(), as an optimization int newlineLength = 0; //length of terminating newline boolean prevCharCR = false; //true of prev char was CR int quoteBefore = 0; boolean previousCharEsc = false; long bytesConsumed = 0; do { int startPosn = bufferPosn; //starting from where we left off the last time if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; if (prevCharCR) { ++bytesConsumed; //account for CR from previous read } bufferLength = in.read(buffer); if (bufferLength <= 0) { break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline if (buffer[bufferPosn] == QUOTE) { if (!previousCharEsc) { quoteBefore = (quoteBefore == 0) ? 1 : 0; } } if (buffer[bufferPosn] == LF && (quoteBefore == 0)) { newlineLength = (prevCharCR) ? 
2 : 1; ++bufferPosn; // at next invocation proceed from following byte break; } if (prevCharCR) { //CR + notLF, we are at notLF newlineLength = 1; break; } prevCharCR = (buffer[bufferPosn] == CR); previousCharEsc = (buffer[bufferPosn] == ESCAPE); } int readLength = bufferPosn - startPosn; if (prevCharCR && newlineLength == 0) { --readLength; //CR at the end of the buffer } bytesConsumed += readLength; int appendLength = readLength - newlineLength; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { str.append(buffer, startPosn, appendLength); txtLength += appendLength; } } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long) Integer.MAX_VALUE) { throw new IOException("Too many bytes before newline: " + bytesConsumed); } return (int) bytesConsumed; }
From source file:org.apache.ben.QuotationLineReader.java
License:Apache License
/** * Read a line terminated by a custom delimiter. *//*from ww w .java 2 s . com*/ private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from inputStream, but the head of the stream may be * already captured in the previous buffer, so we have several cases: * * 1. The buffer tail does not contain any character sequence which * matches with the head of delimiter. We count it as a * ambiguous byte count = 0 * * 2. The buffer tail contains a X number of characters, * that forms a sequence, which matches with the * head of delimiter. We count ambiguous byte count = X * * // *** eg: A segment of input file is as follows * * " record 1792: I found this bug very interesting and * I have completely read about it. record 1793: This bug * can be solved easily record 1794: This ." * * delimiter = "record"; * * supposing:- String at the end of buffer = * "I found this bug very interesting and I have completely re" * There for next buffer = "ad about it. record 179 ...." * * The matching characters in the input * buffer tail and delimiter head = "re" * Therefore, ambiguous byte count = 2 **** // * * 2.1 If the following bytes are the remaining characters of * the delimiter, then we have to capture only up to the starting * position of delimiter. That means, we need not include the * ambiguous characters in str. * * 2.2 If the following bytes are not the remaining characters of * the delimiter ( as mentioned in the example ), * then we have to include the ambiguous characters in str. 
*/ str.clear(); int txtLength = 0; // tracks str.getLength(), as an optimization long bytesConsumed = 0; int delPosn = 0; int ambiguousByteCount = 0; // To capture the ambiguous characters count do { int startPosn = bufferPosn; // Start from previous end position if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) { delPosn++; if (delPosn >= recordDelimiterBytes.length) { bufferPosn++; break; } } else if (delPosn != 0) { bufferPosn--; delPosn = 0; } } int readLength = bufferPosn - startPosn; bytesConsumed += readLength; int appendLength = readLength - delPosn; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { if (ambiguousByteCount > 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); //appending the ambiguous characters (refer case 2.2) bytesConsumed += ambiguousByteCount; ambiguousByteCount = 0; } str.append(buffer, startPosn, appendLength); txtLength += appendLength; } if (bufferPosn >= bufferLength) { if (delPosn > 0 && delPosn < recordDelimiterBytes.length) { ambiguousByteCount = delPosn; bytesConsumed -= ambiguousByteCount; //to be consumed in next } } } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long) Integer.MAX_VALUE) { throw new IOException("Too many bytes before delimiter: " + bytesConsumed); } return (int) bytesConsumed; }
From source file:org.apache.flume.sink.hdfs.HDFSTextFormatter.java
License:Apache License
/**
 * Serializes an event as one LF-terminated record.
 *
 * @param e the event to serialize; it is converted with {@code makeText}
 * @return a newly allocated array holding the record bytes followed by a
 *         single LF byte
 */
@Override
public byte[] getBytes(Event e) {
    Text record = makeText(e);
    // Use a literal LF byte rather than "\n".getBytes(): the latter encodes
    // with the platform default charset, so its first byte is not guaranteed
    // to be 0x0A, and it builds a fresh encoding on every call.
    record.append(new byte[] { '\n' }, 0, 1);
    // Copy only the first getLength() bytes of the array returned by getBytes().
    return Arrays.copyOf(record.getBytes(), record.getLength());
}
From source file:org.apache.gora.accumulo.store.AccumuloStore.java
License:Apache License
Text pad(Text key, int bytes) { if (key.getLength() < bytes) key = new Text(key); while (key.getLength() < bytes) { key.append(new byte[] { 0 }, 0, 1); }//www. j av a 2 s . co m return key; }
From source file:org.apache.jena.tdbloader4.SecondMapper.java
License:Apache License
@Override public void map(LongWritable key, QuadWritable value, Context context) throws IOException, InterruptedException { log.debug("< ({}, {})", key, value); Quad quad = value.getQuad();/* ww w . j ava 2 s. com*/ String s = Utils.serialize(quad.getSubject()); String p = Utils.serialize(quad.getPredicate()); String o = Utils.serialize(quad.getObject()); String g = null; if (!quad.isDefaultGraphGenerated()) { g = Utils.serialize(quad.getGraph()); } // TODO: reuse hash from TDB NodeTableNative? MessageDigest digest = null; try { digest = MessageDigest.getInstance("MD5"); digest.update(s.getBytes("UTF-8")); digest.update(p.getBytes("UTF-8")); digest.update(o.getBytes("UTF-8")); if (g != null) digest.update(g.getBytes("UTF-8")); String hash = new String(Hex.encodeHex(digest.digest())); ht.set(hash); if ((s != null) && (p != null) && (o != null)) { st.set(s); pt.set(p); ot.set(o); Text hs = new Text(ht); hs.append(S, 0, S.length); Text hp = new Text(ht); hp.append(P, 0, P.length); Text ho = new Text(ht); ho.append(O, 0, O.length); emit(context, st, hs); emit(context, pt, hp); emit(context, ot, ho); } if (g != null) { gt.set(g); Text hg = new Text(ht); hg.append(G, 0, G.length); emit(context, gt, hg); EventManager.send(counters, new Event(Constants.eventQuad, quad)); } else { EventManager.send(counters, new Event(Constants.eventTriple, quad.asTriple())); } } catch (Exception e) { throw new TDBLoader4Exception(e); } finally { st.clear(); pt.clear(); ot.clear(); gt.clear(); ht.clear(); } }
From source file:org.apache.rya.indexing.KeyParts.java
License:Apache License
/** * Append any byte array to a row key./*from ww w. j av a2s . c o m*/ * @param bytes append this * @param keyText text to append to */ private static void appendBytes(final byte[] bytes, final Text keyText) { keyText.append(bytes, 0, bytes.length); }
From source file:org.apache.rya.indexing.KeyParts.java
License:Apache License
/** * Get a collision unlikely hash string and append to the key, * so that if two keys have the same value, then they will be the same, * if two different values that occur at the same time there keys are different. * If the application uses a very large number of statements at the exact same time, * the md5 value might be upgraded to for example sha-1 to avoid collisions. * @param statement/*from w w w . jav a 2 s .c o m*/ * @param keyText */ public static void appendUniqueness(final Statement statement, final Text keyText) { keyText.append(HASH_PREFIX, 0, 1); // delimiter final Value statementValue = new Value( StringUtils.getBytesUtf8(StatementSerializer.writeStatement(statement))); final byte[] hashOfValue = Md5Hash.md5Binary(statementValue); keyText.append(hashOfValue, 0, hashOfValue.length); }
From source file:org.apache.tajo.storage.LineReader.java
License:Apache License
/** * Read a line terminated by a custom delimiter. *//*from w w w .ja va 2s . c om*/ private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from inputStream, but the head of the stream may be * already captured in the previous buffer, so we have several cases: * * 1. The buffer tail does not contain any character sequence which * matches with the head of delimiter. We count it as a * ambiguous byte count = 0 * * 2. The buffer tail contains a X number of characters, * that forms a sequence, which matches with the * head of delimiter. We count ambiguous byte count = X * * // *** eg: A segment of input file is as follows * * " record 1792: I found this bug very interesting and * I have completely read about it. record 1793: This bug * can be solved easily record 1794: This ." * * delimiter = "record"; * * supposing:- String at the end of buffer = * "I found this bug very interesting and I have completely re" * There for next buffer = "ad about it. record 179 ...." * * The matching characters in the input * buffer tail and delimiter head = "re" * Therefore, ambiguous byte count = 2 **** // * * 2.1 If the following bytes are the remaining characters of * the delimiter, then we have to capture only up to the starting * position of delimiter. That means, we need not include the * ambiguous characters in str. * * 2.2 If the following bytes are not the remaining characters of * the delimiter ( as mentioned in the example ), * then we have to include the ambiguous characters in str. 
*/ str.clear(); int txtLength = 0; // tracks str.getLength(), as an optimization long bytesConsumed = 0; int delPosn = 0; int ambiguousByteCount = 0; // To capture the ambiguous characters count do { int startPosn = bufferPosn; // Start from previous end position if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0); if (bufferLength <= 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) { delPosn++; if (delPosn >= recordDelimiterBytes.length) { bufferPosn++; break; } } else if (delPosn != 0) { bufferPosn--; delPosn = 0; } } int readLength = bufferPosn - startPosn; bytesConsumed += readLength; int appendLength = readLength - delPosn; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { if (ambiguousByteCount > 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); //appending the ambiguous characters (refer case 2.2) bytesConsumed += ambiguousByteCount; ambiguousByteCount = 0; } str.append(buffer, startPosn, appendLength); txtLength += appendLength; } if (bufferPosn >= bufferLength) { if (delPosn > 0 && delPosn < recordDelimiterBytes.length) { ambiguousByteCount = delPosn; bytesConsumed -= ambiguousByteCount; //to be consumed in next } } } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long) Integer.MAX_VALUE) { throw new IOException("Too many bytes before delimiter: " + bytesConsumed); } return (int) bytesConsumed; }
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/** * Parses a read from an interleaved FASTQ file. * * Only reads a single record.//w w w . j a v a 2 s. c o m * * @param readName Text record containing read name. Output parameter. * @param value Text record containing full record. Output parameter. * @return Returns true if read was successful (did not hit EOF). * * @throws RuntimeException Throws exception if FASTQ record doesn't * have proper formatting (e.g., record doesn't start with @). */ protected final boolean lowLevelFastqRead(final Text readName, final Text value) throws IOException { if (endOfCompressedSplit) { return false; } // ID line readName.clear(); long skipped = appendLineInto(readName, true); if (skipped == 0) { return false; // EOF } if (readName.getBytes()[0] != '@') { throw new RuntimeException("unexpected fastq record didn't start with '@' at " + makePositionMessage() + ". Line: " + readName + ". \n"); } value.append(readName.getBytes(), 0, readName.getLength()); // sequence appendLineInto(value, false); // separator line appendLineInto(value, false); // quality appendLineInto(value, false); return true; }