Example usage for org.apache.hadoop.io Text getBytes

List of usage examples for org.apache.hadoop.io Text getBytes

Introduction

On this page you can find example usage for org.apache.hadoop.io Text getBytes.

Prototype

@Override
public byte[] getBytes() 

Document

Returns the raw bytes; however, only data up to #getLength() is valid.
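
Before the per-project examples, here is a minimal, self-contained sketch (not taken from any of the source files below) of the usual pattern: because getBytes() hands back the Text's backing buffer, every read of that array has to be bounded by getLength().

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.hadoop.io.Text;

public class TextGetBytesSketch {
    public static void main(String[] args) {
        Text text = new Text("hello");
        byte[] backing = text.getBytes(); // backing buffer; may be longer than the encoded data
        int len = text.getLength();       // number of valid UTF-8 bytes

        // Bound every read by getLength(); bytes past it are stale or zero.
        byte[] exact = Arrays.copyOf(backing, len);
        String decoded = new String(backing, 0, len, StandardCharsets.UTF_8);

        System.out.println(decoded + " (" + exact.length + " bytes)");
    }
}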

Usage

From source file:org.apache.pig.impl.util.StorageUtil.java

License:Apache License

/**
 * Transform a line of <code>Text</code> to a <code>Tuple</code>
 *
 * @param val a line of text
 * @param fieldDel the field delimiter
 * @return tuple constructed from the text
 */
public static Tuple textToTuple(Text val, byte fieldDel) {
    return bytesToTuple(val.getBytes(), 0, val.getLength(), fieldDel);
}
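
Passing the raw buffer together with getLength() lets bytesToTuple parse the line in place, avoiding the extra copy that a val.toString().getBytes(...) round trip would make. A hypothetical call (the input line and delimiter are purely illustrative):

Tuple fields = StorageUtil.textToTuple(new Text("a\tb\tc"), (byte) '\t');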

From source file:org.apache.pig.piggybank.storage.CSVExcelStorage.java

License:Apache License

@Override
public Tuple getNext() throws IOException {
    // If SKIP_INPUT_HEADER and this is the first input split, skip header record
    // We store its value as a string though, so we can compare
    // further records to it. If they are the same (this would 
    // happen if multiple small files each with a header were combined
    // into one split), we know to skip the duplicate header record as well.
    if (loadingFirstRecord && headerTreatment == Headers.SKIP_INPUT_HEADER
            && (splitIndex == 0 || splitIndex == -1)) {
        try {
            if (!in.nextKeyValue())
                return null;
            header = ((Text) in.getCurrentValue()).toString();
        } catch (InterruptedException e) {
            int errCode = 6018;
            String errMsg = "Error while reading input";
            throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
        }
    }
    loadingFirstRecord = false;

    mProtoTuple = new ArrayList<Object>();

    getNextInQuotedField = false;
    boolean evenQuotesSeen = true;
    boolean sawEmbeddedRecordDelimiter = false;
    byte[] buf = null;

    if (!mRequiredColumnsInitialized) {
        if (udfContextSignature != null) {
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
            mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(p.getProperty(udfContextSignature));
        }
        mRequiredColumnsInitialized = true;
    }
    // Note: we cannot factor out the check for nextKeyValue() being null,
    // because that call overwrites buf with the new line, which is
    // bad if we have a field with a newline.

    try {
        int recordLen = 0;
        getNextFieldID = 0;

        while (sawEmbeddedRecordDelimiter || getNextFieldID == 0) {
            Text value = null;
            if (sawEmbeddedRecordDelimiter) {

                // Deal with pulling more records from the input, because
                // a double quoted embedded newline was encountered in a field.
                // Save the length of the record so far, plus one byte for the 
                // record delimiter (usually newline) that's embedded in the field 
                // we were working on before falling into this branch:
                int prevLineLen = recordLen + 1;

                // Save previous line (the one with the field that has the newline) in a new array.
                // The last byte will be random; we'll fill in the embedded
                // record delimiter (usually newline) below:
                byte[] prevLineSaved = Arrays.copyOf(buf, prevLineLen);
                prevLineSaved[prevLineLen - 1] = RECORD_DEL;

                // Read the continuation of the record, unless EOF:
                if (!in.nextKeyValue()) {
                    return null;
                }
                value = (Text) in.getCurrentValue();
                recordLen = value.getLength();
                // Grab the continuation's bytes:
                buf = value.getBytes();

                // Combine the previous line and the continuation into a new array.
                // The following copyOf() does half the job: it allocates all the
                // space, and also copies the previous line into that space:
                byte[] prevLineAndContinuation = Arrays.copyOf(prevLineSaved, prevLineLen + recordLen);

                // Now append the continuation. Parms: fromBuf, fromStartPos, toBuf, toStartPos, lengthToCopy:
                System.arraycopy(buf, 0, prevLineAndContinuation, prevLineLen, recordLen);

                // We'll work with the combination now:
                buf = prevLineAndContinuation;

                // Do the whole record over from the start:
                mProtoTuple.clear();
                getNextInQuotedField = false;
                evenQuotesSeen = true;
                getNextFieldID = 0;
                recordLen = prevLineAndContinuation.length;

            } else {
                // Previous record finished cleanly: start with the next record,
                // unless EOF:
                if (!in.nextKeyValue()) {
                    return null;
                }
                value = (Text) in.getCurrentValue();

                // if the line is a duplicate header and 'SKIP_INPUT_HEADER' is set, ignore it
                // (this might happen if multiple files each with a header are combined into a single split)
                if (headerTreatment == Headers.SKIP_INPUT_HEADER && value.toString().equals(header)) {
                    if (!in.nextKeyValue())
                        return null;
                    value = (Text) in.getCurrentValue();
                }

                buf = value.getBytes();
                getNextFieldID = 0;
                recordLen = value.getLength();
            }

            nextTupleSkipChar = false;

            ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);

            sawEmbeddedRecordDelimiter = processOneInRecord(evenQuotesSeen, buf, recordLen, fieldBuffer);

            // The last field is never delimited by a FIELD_DEL, but by
            // the end of the record. So we need to add that last field.
            // The '!sawEmbeddedRecordDelimiter' handles the case of
            // embedded newlines; we are amidst a field, not at
            // the final record:
            if (!sawEmbeddedRecordDelimiter)
                readField(fieldBuffer, getNextFieldID++);
        } // end while

    } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }

    Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
    return t;
}
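
The embedded-newline branch above stitches the saved previous line and its continuation together with Arrays.copyOf plus System.arraycopy. Isolated as a sketch (the method name and parameters are illustrative, not part of CSVExcelStorage), the pattern is:

static byte[] stitch(byte[] prev, int prevLen, byte recordDel, byte[] cont, int contLen) {
    // allocate room for the previous line, the embedded record delimiter, and the continuation
    byte[] merged = Arrays.copyOf(prev, prevLen + 1 + contLen);
    merged[prevLen] = recordDel;                             // re-insert the embedded delimiter
    System.arraycopy(cont, 0, merged, prevLen + 1, contLen); // append only the valid continuation bytes
    return merged;
}

In the quoted code, cont is value.getBytes() and contLen is value.getLength(), so stale bytes past the continuation's length never enter the merged record.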

From source file:org.apache.rya.accumulo.query.RangeBindingSetEntries.java

License:Apache License

/**
 *
 * @param colFamily the column family to test
 * @param startColFamily the column family at the start of the range
 * @param stopColFamily the column family at the end of the range
 * @return true if colFamily lies between startColFamily and stopColFamily
 */
private boolean validateContext(Text colFamily, Text startColFamily, Text stopColFamily) {
    byte[] cfBytes = colFamily.getBytes();
    byte[] start = startColFamily.getBytes();
    byte[] stop = stopColFamily.getBytes();
    // range has an empty column family, so all Keys falling within the Range's Row
    // constraints should match
    if (start.length == 0 && stop.length == 0) {
        return true;
    }
    int result1 = WritableComparator.compareBytes(cfBytes, 0, cfBytes.length, start, 0, start.length);
    int result2 = WritableComparator.compareBytes(cfBytes, 0, cfBytes.length, stop, 0, stop.length);
    return result1 >= 0 && result2 <= 0;
}
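
The comparisons above use cfBytes.length, start.length, and stop.length, i.e. the sizes of the backing arrays. Since only data up to getLength() is guaranteed valid, a more defensive variant (a sketch, not the Rya code) bounds each comparison by getLength():

private static boolean isBetween(Text colFamily, Text startColFamily, Text stopColFamily) {
    if (startColFamily.getLength() == 0 && stopColFamily.getLength() == 0) {
        return true;
    }
    int cmpToStart = WritableComparator.compareBytes(colFamily.getBytes(), 0, colFamily.getLength(),
            startColFamily.getBytes(), 0, startColFamily.getLength());
    int cmpToStop = WritableComparator.compareBytes(colFamily.getBytes(), 0, colFamily.getLength(),
            stopColFamily.getBytes(), 0, stopColFamily.getLength());
    return cmpToStart >= 0 && cmpToStop <= 0;
}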

From source file:org.apache.rya.indexing.accumulo.entity.AccumuloDocIdIndexer.java

License:Apache License

private QueryBindingSet deserializeKey(final Key key, final StarQuery sq, final BindingSet currentBs,
        final Set<String> unCommonVar) {

    final QueryBindingSet currentSolutionBs = new QueryBindingSet();

    final Text row = key.getRow();
    final Text cq = key.getColumnQualifier();

    final String[] cqArray = cq.toString().split(DocIndexIteratorUtil.DOC_ID_INDEX_DELIM);

    boolean commonVarSet = false;

    //if common Var is constant there is no common variable to assign a value to
    if (sq.commonVarConstant()) {
        commonVarSet = true;
    }

    if (!commonVarSet && sq.isCommonVarURI()) {
        final RyaURI rURI = new RyaURI(row.toString());
        currentSolutionBs.addBinding(sq.getCommonVarName(), RyaToRdfConversions.convertValue(rURI));
        commonVarSet = true;
    }

    for (final String s : sq.getUnCommonVars()) {

        final byte[] cqBytes = cqArray[sq.getVarPos().get(s)].getBytes(StandardCharsets.UTF_8);
        final int firstIndex = Bytes.indexOf(cqBytes, DELIM_BYTE);
        final int secondIndex = Bytes.lastIndexOf(cqBytes, DELIM_BYTE);
        final int typeIndex = Bytes.indexOf(cqBytes, TYPE_DELIM_BYTE);
        final String tripleComponent = new String(Arrays.copyOfRange(cqBytes, firstIndex + 1, secondIndex),
                StandardCharsets.UTF_8);
        final byte[] cqContent = Arrays.copyOfRange(cqBytes, secondIndex + 1, typeIndex);
        final byte[] objType = Arrays.copyOfRange(cqBytes, typeIndex, cqBytes.length);

        if (tripleComponent.equals("object")) {
            final byte[] object = Bytes.concat(cqContent, objType);
            org.openrdf.model.Value v = null;
            try {
                v = RyaToRdfConversions.convertValue(RyaContext.getInstance().deserialize(object));
            } catch (final RyaTypeResolverException e) {
                e.printStackTrace();
            }
            currentSolutionBs.addBinding(s, v);

        } else if (tripleComponent.equals("subject")) {
            if (!commonVarSet) {
                final byte[] object = Bytes.concat(row.getBytes(), objType);
                org.openrdf.model.Value v = null;
                try {
                    v = RyaToRdfConversions.convertValue(RyaContext.getInstance().deserialize(object));
                } catch (final RyaTypeResolverException e) {
                    e.printStackTrace();
                }
                currentSolutionBs.addBinding(sq.getCommonVarName(), v);
                commonVarSet = true;
            }
            final RyaURI rURI = new RyaURI(new String(cqContent, StandardCharsets.UTF_8));
            currentSolutionBs.addBinding(s, RyaToRdfConversions.convertValue(rURI));
        } else {
            throw new IllegalArgumentException("Invalid row.");
        }
    }
    for (final String s : unCommonVar) {
        currentSolutionBs.addBinding(s, currentBs.getValue(s));
    }
    return currentSolutionBs;
}

From source file:org.apache.tajo.storage.sequencefile.SequenceFileScanner.java

License:Apache License

@Override
public Tuple next() throws IOException {
    if (!more)
        return null;

    long pos = reader.getPosition();
    boolean remaining = reader.next(EMPTY_KEY);

    if (pos >= end && reader.syncSeen()) {
        more = false;
    } else {
        more = remaining;
    }

    if (more) {
        Tuple tuple = null;
        byte[][] cells;

        if (hasBinarySerDe) {
            BytesWritable bytesWritable = new BytesWritable();
            reader.getCurrentValue(bytesWritable);
            tuple = makeTuple(bytesWritable);
            totalBytes += (long) bytesWritable.getBytes().length;
        } else {
            Text text = new Text();
            reader.getCurrentValue(text);
            cells = BytesUtils.splitPreserveAllTokens(text.getBytes(), delimiter, projectionMap,
                    schema.getColumns().size());
            totalBytes += (long) text.getBytes().length;
            tuple = new LazyTuple(schema, cells, 0, nullChars, serde);
        }
        currentIdx++;
        return tuple;
    } else {
        return null;
    }
}
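
One detail worth noting: totalBytes is incremented by text.getBytes().length, which measures the backing buffer rather than the encoded data. If only the valid bytes should be counted, text.getLength() is the safer measure (a suggestion, not the Tajo code as shipped):

totalBytes += (long) text.getLength(); // counts only the bytes up to getLength()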

From source file:org.archive.jbs.Merge.java

License:Apache License

/**
 * Utility method to construct a JSON Object from a Text
 */
public static Document fromText(Text text) throws IOException {
    return new Document(new InputStreamReader(new ByteArrayInputStream(text.getBytes()), "utf-8"));
}
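
Because the stream wraps the whole backing array, any stale bytes past getLength() would also be handed to the parser. ByteArrayInputStream has an (array, offset, length) constructor, so a variant that respects getLength() (a sketch, not the org.archive.jbs code) would be:

public static Document fromText(Text text) throws IOException {
    return new Document(new InputStreamReader(
            new ByteArrayInputStream(text.getBytes(), 0, text.getLength()), "utf-8"));
}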

From source file:org.bdgenomics.adam.io.FastqRecordReader.java

License:Apache License

/**
 * Position the input stream at the start of the first record.
 *
 * @param stream The stream to reposition.
 */
protected final int positionAtFirstRecord(final FSDataInputStream stream, final CompressionCodec codec)
        throws IOException {
    Text buffer = new Text();
    long originalStart = start;

    LineReader reader;
    if (codec == null) {
        // Advance to the start of the first record that ends with /1
        // We use a temporary LineReader to read lines until we find the
        // position of the right one.  We then seek the file to that position.
        stream.seek(start);
        reader = new LineReader(stream);
    } else {
        // Unlike the codec == null case, we don't seek before creating the
        // reader, SplittableCompressionCodec.createInputStream places the
        // stream at the start of the first compression block after our
        // split start
        //
        // as noted above, we need to be at pos 0 in the stream before
        // calling this
        reader = new LineReader(((SplittableCompressionCodec) codec).createInputStream(stream, null, start, end,
                SplittableCompressionCodec.READ_MODE.BYBLOCK));
    }

    int bytesRead = 0;
    do {
        bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
        int bufferLength = buffer.getLength();
        if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
            start += bytesRead;
        } else {

            // line starts with @.  Read two more and verify that it starts
            // with a +:
            //
            // @<readname>
            // <sequence>
            // +[readname]
            //
            // if the second line we read starts with a @, we know that
            // we've read:
            //
            // <qualities> <-- @ is a valid ASCII phred encoding
            // @<readname>
            //
            // and thus, the second read is the delimiter and we can break
            long trackForwardPosition = start + bytesRead;

            bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
            if (buffer.getLength() > 0 && buffer.getBytes()[0] == '@') {
                start = trackForwardPosition;
                break;
            } else {
                trackForwardPosition += bytesRead;
            }

            bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
            trackForwardPosition += bytesRead;
            if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                break; // all good!
            } else {
                start = trackForwardPosition;
            }
        }
    } while (bytesRead > 0);

    pos = start;
    start = originalStart;
    stream.seek(start);
    return (int) (pos - originalStart);
}

From source file:org.bdgenomics.adam.io.FastqRecordReader.java

License:Apache License

/**
 * Parses a read from an interleaved FASTQ file.
 *
 * Only reads a single record.
 *
 * @param readName Text record containing read name. Output parameter.
 * @param value Text record containing full record. Output parameter.
 * @return Returns true if read was successful (did not hit EOF).
 *
 * @throws RuntimeException Throws exception if FASTQ record doesn't
 *   have proper formatting (e.g., record doesn't start with @).
 */
protected final boolean lowLevelFastqRead(final Text readName, final Text value) throws IOException {

    if (endOfCompressedSplit) {
        return false;
    }

    // ID line
    readName.clear();
    long skipped = appendLineInto(readName, true);
    if (skipped == 0) {
        return false; // EOF
    }

    if (readName.getBytes()[0] != '@') {
        throw new RuntimeException("unexpected fastq record didn't start with '@' at " + makePositionMessage()
                + ". Line: " + readName + ". \n");
    }
    value.append(readName.getBytes(), 0, readName.getLength());

    // sequence
    appendLineInto(value, false);

    // separator line
    appendLineInto(value, false);

    // quality
    appendLineInto(value, false);

    return true;
}
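
The readName.getBytes()[0] check above relies on appendLineInto always appending a trailing newline, so a successful read (skipped > 0) leaves at least one valid byte in the buffer. A more defensive version of that check (a sketch, not the ADAM code) would also guard with getLength(), as the other Text accesses in this class do:

if (readName.getLength() == 0 || readName.getBytes()[0] != '@') {
    throw new RuntimeException("unexpected fastq record didn't start with '@' at " + makePositionMessage()
            + ". Line: " + readName + ". \n");
}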

From source file:org.bdgenomics.adam.io.FastqRecordReader.java

License:Apache License

/**
 * Reads a newline into a text record from the underlying line reader.
 *
 * @param dest Text record to read line into.
 * @param eofOk Whether an EOF is acceptable in this line.
 * @return Returns the number of bytes read.
 *
 * @throws EOFException Throws if eofOk was false and we hit an EOF in
 *    the current line.
 */
private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException {
    Text buf = new Text();
    int bytesRead = lineReader.readLine(buf, (int) Math.min(maxLineLength, end - start));

    // ok, so first, split/unsplit, compressed/uncompressed notwithstanding,
    // there are three cases we can run into:
    //
    // 1. we read data
    // 2. we are at an acceptable eof/end-of-split and don't read data
    // 3. we are at an unacceptable eof/end-of-split and don't read data
    //
    // cases 1 and 2 are consistent across split/unsplit, compressed/uncompressed.
    //
    // case 3 is simple in the unsplit or uncompressed cases; something has
    // gone wrong, we throw an EOFException, and move on with our lives
    //
    // case 3 is where working with split compressed files gets fun.
    //
    // with the split compression stream, the first time we read past the
    // end of the last compression block within a file split, we get no
    // bytes back. the BZip2Codec and BGZFCodec actually tell us that
    // we'll get -2 back in this case, but we'll cast a wider net yet.
    //
    // this is important information---if we don't know this, we'll keep reading
    // past the end of the split to the end of the file---but we still need to
    // finish reading our multiline record, so we set some state to let us know
    // that we're reading the last record in the split (endOfCompressedSplit)
    // and repeat the read. if the read fails again, then that means that
    // something has actually gone wrong, and we want to fall through and
    // throw an EOFException or return no bytes read (depending on eofOk).
    // that's why we have the lastReadWasZeroBytes flag around. we set this
    // to true on the first read that gets bytesRead <= 0, and clear it on
    // any read that reads more than 0 bytes.
    if (isSplittable && isCompressed && !lastReadWasZeroBytes && bytesRead <= 0 && !eofOk) {

        // we need to clear the reader state so we can continue reading
        ((ResettableCompressedSplitLineReader) lineReader).reset();

        // set the state to stop us from reading another record and
        // to catch back-to-back failed reads
        lastReadWasZeroBytes = true;
        endOfCompressedSplit = true;

        // recursively call to redo the read
        return appendLineInto(dest, eofOk);
    } else if (bytesRead < 0 || (bytesRead == 0 && !eofOk)) {
        throw new EOFException();
    } else {
        lastReadWasZeroBytes = false;
    }

    dest.append(buf.getBytes(), 0, buf.getLength());
    dest.append(newline, 0, 1);
    if (isSplittable && isCompressed) {
        pos = ((SplitCompressionInputStream) inputStream).getPos();
    } else {
        pos += bytesRead;
    }

    return bytesRead;
}

From source file:org.cloudata.examples.web.DocFreqReduce.java

License:Apache License

public void reduce(WritableComparable key, Iterator<Writable> values,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    if (exception != null) {
        throw exception;
    }
    Text tKey = (Text) key;
    Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, tKey.getLength());

    int docFreq = 0;
    while (values.hasNext()) {
        values.next(); // consume the value; we only need the count
        docFreq++;
    }

    Row row = new Row(rowKey);
    try {
        row.addCell("df", new Cell(Cell.Key.EMPTY_KEY, Long.toString(docFreq).getBytes()));
        termTable.put(row);
    } catch (Exception e) {
        LOG.error(e.getMessage(), e);
    }
}