List of usage examples for org.apache.hadoop.io.Text#clear()
public void clear()
From source file:org.apache.hama.bsp.message.TestMessageIO.java
License:Apache License
public void testSpillInputStream() throws Exception { File f = null;/*from ww w .j a v a2s.c o m*/ try { String fileName = System.getProperty("java.io.tmpdir") + File.separatorChar + "testSpillInputStream.txt"; Configuration conf = new HamaConfiguration(); SpilledDataProcessor processor = new WriteSpilledDataProcessor(fileName); processor.init(conf); SpillingDataOutputBuffer outputBuffer = new SpillingDataOutputBuffer(2, 1024, 1024, true, processor); Text text = new Text("Testing the spillage of spilling buffer"); for (int i = 0; i < 100; ++i) { text.write(outputBuffer); outputBuffer.markRecordEnd(); } assertTrue(outputBuffer != null); assertTrue(outputBuffer.size() == 4000); assertTrue(outputBuffer.hasSpilled()); f = new File(fileName); assertTrue(f.exists()); outputBuffer.close(); assertTrue(f.length() == 4000);// + (4000 / 1024 + 1) * 4)); SpilledDataInputBuffer inputBuffer = outputBuffer.getInputStreamToRead(fileName); for (int i = 0; i < 100; ++i) { text.readFields(inputBuffer); assertTrue("Testing the spillage of spilling buffer".equals(text.toString())); text.clear(); } try { text.readFields(inputBuffer); assertTrue(false); } catch (EOFException eof) { assertTrue(true); } inputBuffer.close(); inputBuffer.completeReading(false); assertTrue(f.exists()); inputBuffer.completeReading(true); assertFalse(f.exists()); } finally { if (f != null) { if (f.exists()) { f.delete(); } } } }
From source file:org.apache.hama.bsp.message.TestSpillingQueue.java
License:Apache License
/** * Test the spilling queue where the message class is not specified and the * queue uses ObjectWritable to store messages. * /* ww w .ja va2 s . co m*/ * @throws Exception */ public void testObjectWritableSpillingQueue() throws Exception { String msg = "Testing the spillage of spilling buffer"; Text text = new Text(msg); TaskAttemptID id = new TaskAttemptID(new TaskID("123", 1, 2), 0); SpillingQueue<Text> queue = new SpillingQueue<Text>(); Configuration conf = new HamaConfiguration(); String fileName = System.getProperty("java.io.tmpdir") + File.separatorChar + new BigInteger(128, new SecureRandom()).toString(32); File file = new File(fileName); conf.set(SpillingQueue.SPILLBUFFER_FILENAME, fileName); queue.init(conf, id); queue.prepareWrite(); for (int i = 0; i < 1000; ++i) { queue.add(text); } queue.prepareRead(); for (Text t : queue) { assertTrue(msg.equals(t.toString())); text.clear(); } assertTrue(queue.poll() == null); assertTrue(file.exists()); queue.close(); assertFalse(file.exists()); }
From source file:org.apache.nifi.processors.hadoop.KeyValueReader.java
License:Apache License
@Override public Set<FlowFile> readSequenceFile(Path file, Configuration configuration, FileSystem fileSystem) throws IOException { final SequenceFile.Reader reader; Set<FlowFile> flowFiles = new HashSet<>(); reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file))); final Text key = new Text(); final KeyValueWriterCallback callback = new KeyValueWriterCallback(reader); final String inputfileName = file.getName() + "." + System.nanoTime() + "."; int counter = 0; LOG.debug("Read from SequenceFile: {} ", new Object[] { file }); try {/* ww w . j a va2 s.c o m*/ while (reader.next(key)) { String fileName = key.toString(); // the key may be a file name, and may not if (LOOKS_LIKE_FILENAME.matcher(fileName).matches()) { if (fileName.contains(File.separator)) { fileName = StringUtils.substringAfterLast(fileName, File.separator); } fileName = fileName + "." + System.nanoTime(); } else { fileName = inputfileName + ++counter; } FlowFile flowFile = session.create(); flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), fileName); callback.key = key; try { flowFile = session.write(flowFile, callback); flowFiles.add(flowFile); } catch (ProcessException e) { LOG.error("Could not write to flowfile {}", new Object[] { flowFile }, e); session.remove(flowFile); } key.clear(); } } finally { IOUtils.closeQuietly(reader); } return flowFiles; }
From source file:org.apache.nifi.processors.hadoop.ValueReader.java
License:Apache License
@Override public Set<FlowFile> readSequenceFile(final Path file, Configuration configuration, FileSystem fileSystem) throws IOException { Set<FlowFile> flowFiles = new HashSet<>(); final SequenceFile.Reader reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file))); final String inputfileName = file.getName() + "." + System.nanoTime() + "."; int counter = 0; LOG.debug("Reading from sequence file {}", new Object[] { file }); final OutputStreamWritableCallback writer = new OutputStreamWritableCallback(reader); Text key = new Text(); try {//from ww w . java 2 s .co m while (reader.next(key)) { String fileName = key.toString(); // the key may be a file name, and may not if (LOOKS_LIKE_FILENAME.matcher(fileName).matches()) { if (fileName.contains(File.separator)) { fileName = StringUtils.substringAfterLast(fileName, File.separator); } fileName = fileName + "." + System.nanoTime(); } else { fileName = inputfileName + ++counter; } FlowFile flowFile = session.create(); flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), fileName); try { flowFile = session.write(flowFile, writer); flowFiles.add(flowFile); } catch (ProcessException e) { LOG.error("Could not write to flowfile {}", new Object[] { flowFile }, e); session.remove(flowFile); } key.clear(); } } finally { IOUtils.closeQuietly(reader); } return flowFiles; }
From source file:org.apache.tajo.storage.LineReader.java
License:Apache License
/**
 * Read a line terminated by a custom (multi-byte) delimiter into {@code str}.
 *
 * The head of the stream may already be captured in {@code buffer}, so the
 * buffer tail can end in a partial match of the delimiter's head. Such bytes
 * are "ambiguous": e.g. with delimiter "record", a buffer ending in "...re"
 * gives ambiguousByteCount = 2. If the next buffer completes the delimiter,
 * the ambiguous bytes are NOT part of the line; otherwise they are appended
 * to {@code str} before the new data.
 *
 * @param str output record; cleared first, at most maxLineLength bytes kept
 * @param maxLineLength maximum bytes stored into str; the rest is discarded
 * @param maxBytesToConsume soft cap on bytes consumed in this call
 * @return number of bytes consumed, including the delimiter
 * @throws IOException if the underlying stream fails, or more than
 *         Integer.MAX_VALUE bytes precede the delimiter
 */
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    long bytesConsumed = 0;
    int delPosn = 0; // how many leading delimiter bytes matched so far
    int ambiguousByteCount = 0; // partial delimiter match carried over from the previous buffer
    do {
        int startPosn = bufferPosn; // Start from previous end position
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0);
            if (bufferLength <= 0) {
                // EOF: the carried-over partial match turned out to be real data.
                str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                break; // EOF
            }
        }
        // Scan forward, tracking a running prefix-match against the delimiter.
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
                delPosn++;
                if (delPosn >= recordDelimiterBytes.length) {
                    bufferPosn++;
                    break; // full delimiter matched
                }
            } else if (delPosn != 0) {
                // Mismatch mid-delimiter: re-examine this byte with delPosn reset.
                bufferPosn--;
                delPosn = 0;
            }
        }
        int readLength = bufferPosn - startPosn;
        bytesConsumed += readLength;
        int appendLength = readLength - delPosn; // exclude (partially) matched delimiter bytes
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            if (ambiguousByteCount > 0) {
                // Previous buffer's partial match was not a delimiter after all (case 2.2).
                str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                bytesConsumed += ambiguousByteCount;
                ambiguousByteCount = 0;
            }
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
        if (bufferPosn >= bufferLength) {
            if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
                // Buffer ended inside a possible delimiter: defer the decision.
                ambiguousByteCount = delPosn;
                bytesConsumed -= ambiguousByteCount; // to be consumed in next
            }
        }
    } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/** * Parses a read from an interleaved FASTQ file. * * Only reads a single record.//from w w w . j a v a2 s .co m * * @param readName Text record containing read name. Output parameter. * @param value Text record containing full record. Output parameter. * @return Returns true if read was successful (did not hit EOF). * * @throws RuntimeException Throws exception if FASTQ record doesn't * have proper formatting (e.g., record doesn't start with @). */ protected final boolean lowLevelFastqRead(final Text readName, final Text value) throws IOException { if (endOfCompressedSplit) { return false; } // ID line readName.clear(); long skipped = appendLineInto(readName, true); if (skipped == 0) { return false; // EOF } if (readName.getBytes()[0] != '@') { throw new RuntimeException("unexpected fastq record didn't start with '@' at " + makePositionMessage() + ". Line: " + readName + ". \n"); } value.append(readName.getBytes(), 0, readName.getLength()); // sequence appendLineInto(value, false); // separator line appendLineInto(value, false); // quality appendLineInto(value, false); return true; }
From source file:org.cloudata.core.common.util.CloudataLineReader.java
License:Apache License
/**
 * Read from the InputStream into the given Text.
 *
 * A line is terminated by '\n', '\r', or "\r\n"; EOF also terminates an
 * otherwise unterminated line. The terminator is consumed but not stored.
 *
 * @param str the object to store the given line
 * @return the number of bytes read including the newline
 * @throws IOException if the underlying stream throws
 */
public int readLine(Text str) throws IOException {
    str.clear();
    boolean hadFinalNewline = false;
    boolean hadFinalReturn = false; // saw '\r'; a following '\n' would make it CRLF
    boolean hitEndOfFile = false;
    int startPosn = bufferPosn;
    outerLoop: while (true) {
        if (bufferPosn >= bufferLength) {
            if (!backfill()) {
                hitEndOfFile = true;
                break;
            }
        }
        startPosn = bufferPosn;
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            switch (buffer[bufferPosn]) {
            case '\n':
                hadFinalNewline = true;
                bufferPosn += 1;
                break outerLoop;
            case '\r':
                if (hadFinalReturn) {
                    // "\r\r": leave this \r in the stream, so we'll get it next time
                    break outerLoop;
                }
                hadFinalReturn = true;
                break;
            default:
                if (hadFinalReturn) {
                    // "\r" followed by a regular byte: the line ended at the \r.
                    break outerLoop;
                }
            }
        }
        // Buffer exhausted without a terminator: flush this segment
        // (minus a trailing '\r' whose fate is still undecided) and refill.
        int length = bufferPosn - startPosn - (hadFinalReturn ? 1 : 0);
        if (length >= 0) {
            str.append(buffer, startPosn, length);
        }
    }
    int newlineLength = (hadFinalNewline ? 1 : 0) + (hadFinalReturn ? 1 : 0);
    if (!hitEndOfFile) {
        // Terminator found mid-buffer: append the final segment before it.
        int length = bufferPosn - startPosn - newlineLength;
        if (length > 0) {
            str.append(buffer, startPosn, length);
        }
    }
    return str.getLength() + newlineLength;
}
From source file:org.hypertable.hadoop.mapred.HypertableRecordReader.java
License:Open Source License
/**
 * Populates {@code key} from {@code cell_key} as a tab-separated byte string:
 * [timestamp TAB] row TAB column_family [":" column_qualifier].
 *
 * The t_* byte-array fields cache the UTF-8 encodings between calls.
 *
 * !!
 * If the key format changes, the code which invokes fill_key()
 * will need to be adjusted because it uses a hard-coded length
 * of 24 + cell.key.row.length()!
 *
 * NOTE(review): the append phase tests only m_include_timestamps, not whether
 * THIS cell_key actually had a timestamp set — a key without one would reuse
 * the stale t_timestamp from a previous call. Same for the other cached t_*
 * fields. Looks intentional (fields repeat across consecutive cells) — TODO
 * confirm against callers.
 */
private void fill_key(Text key, Key cell_key) {
    boolean clear = false; /* XXX not sure if "clear" is necessary */
    try {
        // Refresh only the components this cell_key carries; mark that the
        // output key must be rebuilt if anything changed.
        if (m_include_timestamps && cell_key.isSetTimestamp()) {
            t_timestamp = Long.toString(cell_key.timestamp).getBytes("UTF-8");
            clear = true;
        }
        if (cell_key.isSetRow()) {
            t_row = cell_key.row.getBytes("UTF-8");
            clear = true;
        }
        if (cell_key.isSetColumn_family()) {
            t_column_family = cell_key.column_family.getBytes("UTF-8");
            clear = true;
        }
        if (cell_key.isSetColumn_qualifier()) {
            t_column_qualifier = cell_key.column_qualifier.getBytes("UTF-8");
            clear = true;
        }
    } catch (UnsupportedEncodingException e) {
        // UTF-8 is mandatory on every JVM; treat its absence as fatal.
        e.printStackTrace();
        System.exit(-1);
    }
    if (clear) {
        key.clear();
        if (m_include_timestamps) {
            key.append(t_timestamp, 0, t_timestamp.length);
            key.append(tab, 0, tab.length);
        }
        key.append(t_row, 0, t_row.length);
        key.append(tab, 0, tab.length);
        key.append(t_column_family, 0, t_column_family.length);
        // Qualifier is optional; a ':' separates it from the family.
        if (t_column_qualifier.length > 0) {
            key.append(colon, 0, colon.length);
            key.append(t_column_qualifier, 0, t_column_qualifier.length);
        }
    }
}
From source file:org.rassee.omniture.hadoop.util.EscapedLineReader.java
License:Open Source License
/**
 * Read one line from the InputStream into the given Text. A line
 * can be terminated by one of the following: '\n' (LF), '\r' (CR),
 * or '\r\n' (CR+LF). Will ignore any of these termination characters
 * if they are preceded by a designated escape character. EOF also
 * terminates an otherwise unterminated line.
 *
 * Ambiguity handling: if the buffer ends in CR we cannot yet tell whether
 * the terminator is CR or CRLF, so prevCharCR carries that state across a
 * buffer refill and the decision is made on the next byte.
 *
 * NOTE(review): the escape lookback indexes buffer[bufferPos-1]/[-2]
 * directly, so an escape character that ends one buffer cannot be seen
 * after a refill resets bufferPos to 0 — presumably acceptable for this
 * format; TODO confirm.
 *
 * @param str the object to store the given line (without the newline)
 * @param maxLineLength the maximum number of bytes to store into str;
 * the rest will be silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume in
 * this call. This is only a hint, because if the line crosses this
 * threshold, we allow it to happen. It can overshoot potentially by
 * as much as one buffer length.
 * @return the number of bytes read including the (longest) newline found
 * @throws IOException
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    str.clear();
    int txtLength = 0; // tracks str.getLength() as an optimization
    int newLineLength = 0; // length of the terminating newline
    boolean prevCharCR = false; // true if prev char was \r
    long bytesConsumed = 0;
    do {
        int startPos = bufferPos; // starting from where we left off
        if (bufferPos >= bufferLength) {
            startPos = bufferPos = 0;
            if (prevCharCR)
                ++bytesConsumed; // account for CR from previous read
            bufferLength = in.read(buffer);
            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPos < bufferLength; ++bufferPos) {
            // A terminator is ignored when the byte before it is the escape
            // char; with prevCharCR we are logically one byte ahead, hence -2.
            boolean escaped = false;
            if (prevCharCR && bufferPos > 1)
                escaped = (buffer[bufferPos - 2] == escapeChar);
            if (!prevCharCR && bufferPos > 0)
                escaped = (buffer[bufferPos - 1] == escapeChar);
            if (buffer[bufferPos] == LF && !escaped) {
                newLineLength = prevCharCR ? 2 : 1;
                ++bufferPos; // at next loop proceed from following byte
                break;
            }
            if (prevCharCR && !escaped) { // CR + notLF, we are at notLF
                newLineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPos] == CR);
            //prevCharCR = (buffer[bufferPos] == CR && !escaped);
        }
        int readLength = bufferPos - startPos;
        // A trailing undecided CR is not yet part of this line's byte count.
        if (prevCharCR && newLineLength == 0)
            --readLength;
        bytesConsumed += readLength;
        int appendLength = readLength - newLineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength; // silently truncate overlong lines
        }
        if (appendLength > 0) {
            str.append(buffer, startPos, appendLength);
            txtLength += appendLength;
        }
    } while (newLineLength == 0 && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > (long) Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    return (int) bytesConsumed;
}
From source file:org.springframework.yarn.batch.item.LineReader.java
License:Apache License
/**
 * Read a line terminated by one of CR, LF, or CRLF.
 *
 * The head of the stream may already be buffered, giving three cases:
 * 1. No newline in the buffer — copy everything and refill.
 * 2. An unambiguously terminated line — copy it to str.
 * 3. Buffer ends in CR — copy up to the CR, then look at the next byte:
 *    if it is LF, consume it too. prevCharCR carries this pending-CR state
 *    across a buffer refill.
 *
 * @param str output line, without its terminator; cleared first
 * @param maxLineLength maximum bytes stored into str; excess is discarded
 * @param maxBytesToConsume soft cap on bytes consumed in this call
 * @return bytes consumed, including the terminating newline
 * @throws IOException on stream failure, or if more than
 *         Integer.MAX_VALUE bytes precede the newline
 */
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    int newlineLength = 0; // length of terminating newline
    boolean prevCharCR = false; // true of prev char was CR
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; // starting from where we left off the
        // last time
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            if (prevCharCR) {
                ++bytesConsumed; // account for CR from previous read
            }
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { // search for
            // newline
            if (buffer[bufferPosn] == LF) {
                newlineLength = (prevCharCR) ? 2 : 1;
                ++bufferPosn; // at next invocation proceed from following
                // byte
                break;
            }
            if (prevCharCR) { // CR + notLF, we are at notLF
                newlineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPosn] == CR);
        }
        int readLength = bufferPosn - startPosn;
        if (prevCharCR && newlineLength == 0) {
            --readLength; // CR at the end of the buffer
        }
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength; // silently truncate overlong lines
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}