Example usage for org.apache.hadoop.io BytesWritable set

Introduction

This page collects usage examples for org.apache.hadoop.io.BytesWritable.set.

Prototype

public void set(byte[] newData, int offset, int length) 

Document

Set the value to a copy of the given byte range
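
Before the project listings, here is a minimal, self-contained sketch (not taken from any of the projects below) illustrating the copy semantics of set(byte[], int, int): the writable copies the given range into its own buffer, reports the logical size via getLength(), and getBytes() may return a larger, padded array.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.BytesWritable;

public class BytesWritableSetExample {
    public static void main(String[] args) {
        byte[] data = "hello world".getBytes(StandardCharsets.UTF_8);

        // copy only the first five bytes ("hello") into the writable
        BytesWritable bw = new BytesWritable();
        bw.set(data, 0, 5);

        // the writable holds its own copy, so mutating the source has no effect on it
        data[0] = 'X';

        // getBytes() may be padded beyond the logical size; always honor getLength()
        String copied = new String(bw.getBytes(), 0, bw.getLength(), StandardCharsets.UTF_8);
        System.out.println(copied); // prints "hello"
    }
}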

Usage

From source file: org.apache.tez.runtime.library.common.comparator.TestProxyComparator.java

License: Apache License

private static final void set(BytesWritable bw, String s) {
    byte[] b = s.getBytes(Charset.forName("utf-8"));
    bw.set(b, 0, b.length);
}

From source file: org.commoncrawl.nutch.tools.arc.ArcRecordReader.java

License: Apache License

/**
 * <p>Returns true if the next record in the split is read into the key and 
 * value pair.  The key will be the arc record header and the value will be
 * the raw content bytes of the arc record.</p>
 * @param key The record key
 * @param value The record value
 * 
 * @return True if the next record is read.
 * 
 * @throws IOException If an error occurs while reading the record value.
 */
public boolean next(Text key, BytesWritable value) throws IOException {

    try {

        // get the starting position on the input stream
        long startRead = in.getPos();
        byte[] magicBuffer = null;

        // we need this loop to handle false positives in reading of gzip records
        while (true) {

            // while we haven't passed the end of the split
            if (startRead >= splitEnd) {
                return false;
            }

            // scanning for the gzip header
            boolean foundStart = false;
            while (!foundStart) {

                // start at the current file position and scan for 1K at time, break
                // if there is no more to read
                startRead = in.getPos();
                magicBuffer = new byte[1024];
                int read = in.read(magicBuffer);
                if (read < 0) {
                    break;
                }

                // scan the byte array for the gzip header magic number.  This happens
                // byte by byte
                for (int i = 0; i < read - 1; i++) {
                    byte[] testMagic = new byte[2];
                    System.arraycopy(magicBuffer, i, testMagic, 0, 2);
                    if (isMagic(testMagic)) {
                        // set the next start to the current gzip header
                        startRead += i;
                        foundStart = true;
                        break;
                    }
                }
            }

            // seek to the start of the gzip header
            in.seek(startRead);
            ByteArrayOutputStream baos = null;
            int totalRead = 0;

            try {

                // read 4K of the gzip at a time putting into a byte array
                byte[] buffer = new byte[4096];
                GZIPInputStream zin = new GZIPInputStream(in);
                int gzipRead = -1;
                baos = new ByteArrayOutputStream();
                while ((gzipRead = zin.read(buffer, 0, buffer.length)) != -1) {
                    baos.write(buffer, 0, gzipRead);
                    totalRead += gzipRead;
                }
            } catch (Exception e) {

                // there are times we get false positives where the gzip header exists
                // but it is not an actual gzip record, so we ignore it and start
                // over seeking
                // LOG.debug("Ignoring position: " + (startRead));
                if (startRead + 1 < fileLen) {
                    in.seek(startRead + 1);
                }
                continue;
            }

            // change the output stream to a byte array
            byte[] content = baos.toByteArray();

            // the first line of the raw content in arc files is the header
            int eol = 0;
            for (int i = 0; i < content.length; i++) {
                if (i > 0 && content[i] == '\n') {
                    eol = i;
                    break;
                }
            }

            // create the header and the raw content minus the header
            String header = new String(content, 0, eol).trim();
            byte[] raw = new byte[(content.length - eol) - 1];
            System.arraycopy(content, eol + 1, raw, 0, raw.length);

            // populate key and values with the header and raw content.
            Text keyText = (Text) key;
            keyText.set(header);
            BytesWritable valueBytes = (BytesWritable) value;
            valueBytes.set(raw, 0, raw.length);

            // TODO: It would be best to start at the end of the gzip read but 
            // the bytes read in gzip don't match raw bytes in the file so we 
            // overshoot the next header.  With this current method you get
            // some false positives but don't miss records.
            if (startRead + 1 < fileLen) {
                in.seek(startRead + 1);
            }

            // populated the record, now return
            return true;
        }
    } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
    }

    // couldn't populate the record or there is no next record to read
    return false;
}
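
The listing above calls a private isMagic(byte[]) helper that is not shown. A plausible reconstruction, assuming it simply tests the two-byte gzip magic number (0x1f, 0x8b), might look like the sketch below; it is for illustration only, not the actual source.

// hypothetical reconstruction of the helper referenced above
private static final byte[] MAGIC = {(byte) 0x1f, (byte) 0x8b};

private static boolean isMagic(byte[] candidate) {
    // a gzip member always starts with the bytes 0x1f 0x8b
    return candidate != null && candidate.length >= MAGIC.length
            && candidate[0] == MAGIC[0] && candidate[1] == MAGIC[1];
}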

From source file: org.geotools.WholeFile.WholeFileRecordReader.java

License: Apache License

@Override
public boolean next(Text key, BytesWritable value) throws IOException {
    if (!processed) {
        byte[] contents = new byte[(int) fileSplit.getLength()];
        Path file = fileSplit.getPath();

        String fileName = file.getName();
        key.set(fileName);

        FileSystem fs = file.getFileSystem(conf);
        FSDataInputStream in = null;
        try {
            in = fs.open(file);
            IOUtils.readFully(in, contents, 0, contents.length);
            value.set(contents, 0, contents.length);
        } finally {
            IOUtils.closeStream(in);
        }
        processed = true;
        return true;
    }
    return false;
}

From source file: org.pentaho.hadoop.mapreduce.converter.converters.KettleTypeToBytesWritableConverter.java

License: Apache License

@Override
public BytesWritable convert(ValueMetaInterface meta, Object obj) throws TypeConversionException {
    try {
        BytesWritable result = new BytesWritable();
        byte[] binary = meta.getBinary(obj);
        result.set(binary, 0, binary.length);
        return result;
    } catch (Exception ex) {
        throw new TypeConversionException(BaseMessages.getString(TypeConverterFactory.class, "ErrorConverting",
                BytesWritable.class.getSimpleName(), obj), ex);
    }
}

From source file: org.pooledtimeseries.cartesian.CartesianRecordReader.java

License: Apache License

@Override
public boolean next(Text key, BytesWritable value) throws IOException {

    do {
        // If we are to go to the next left key/value pair
        if (goToNextLeft) {
            // Read the next key value pair, false means no more pairs
            if (!leftRR.next(lkey, lvalue)) {
                // If no more, then this task is nearly finished
                alldone = true;
                break;
            } else {
                // If we aren't done, set the value to the key and set
                // our flags
                goToNextLeft = alldone = false;

                // Reset the right record reader
                this.rightRR = this.rightFIF.getRecordReader(this.rightIS, this.rightConf, this.rightReporter);
            }

            if (this.pairWithItself) {
                // shifting right data set to avoid repeated pairs
                // we consider a,b == b,a
                for (int i = 0; i < rightShiftCount; i++) {
                    rightRR.next(rkey, rvalue);
                }
                rightShiftCount++;
            }
        }

        // Read the next key value pair from the right data set
        if (rightRR.next(rkey, rvalue)) {
            // If success, set key and value for left and right splits
            key.set(lkey.toString() + "~" + rkey.toString());
            // Merge FeatureVector of both videos
            // Order is important and should be same as order of key
            List<FeatureVector> featureList = (List<FeatureVector>) PoTSerialiser.getObject(lvalue.getBytes());
            featureList.addAll((List<FeatureVector>) PoTSerialiser.getObject(rvalue.getBytes()));
            byte[] featureListBytes = PoTSerialiser.getBytes(featureList);
            value.set(featureListBytes, 0, featureListBytes.length);

            // This assumes that key will always be unique among all splits
            if (lkey.toString().equals(rkey.toString())) {
                this.pairWithItself = true;
            }
        } else {
            // Otherwise, this right data set is complete
            // and we should go to the next left pair
            goToNextLeft = true;
        }

        // This loop will continue if we finished reading key/value
        // pairs from the right data set
    } while (goToNextLeft);

    if (alldone) {
        // reset shift counter
        rightShiftCount = 1;
        this.pairWithItself = false;
    }
    // Return true if a key/value pair was read, false otherwise
    return !alldone;
}

From source file: org.zuinnote.hadoop.bitcoin.format.BitcoinBlockRecordReader.java

License: Apache License

/**
*
* Reads the next block.
*
* @param key is a 64-byte array (hashMerkleRoot followed by hashPrevBlock)
* @param value is a deserialized Java object of class BitcoinBlock
*
* @return true if next block is available, false if not
*/
public boolean next(BytesWritable key, BitcoinBlock value) throws IOException {
    // read all the blocks, including, if necessary, a block overlapping the split boundary
    while (getFilePosition() <= getEnd()) { // stop once we have gone beyond the split or there is no further data left
        BitcoinBlock dataBlock = null;
        try {
            dataBlock = getBbr().readBlock();

        } catch (BitcoinBlockReadException e) {
            // log
            LOG.error(e);
        }
        if (dataBlock == null)
            return false;
        byte[] hashMerkleRoot = dataBlock.getHashMerkleRoot();
        byte[] hashPrevBlock = dataBlock.getHashPrevBlock();
        byte[] newKey = new byte[hashMerkleRoot.length + hashPrevBlock.length];
        for (int i = 0; i < hashMerkleRoot.length; i++) {
            newKey[i] = hashMerkleRoot[i];
        }
        for (int j = 0; j < hashPrevBlock.length; j++) {
            newKey[j + hashMerkleRoot.length] = hashPrevBlock[j];
        }
        key.set(newKey, 0, newKey.length);
        value.set(dataBlock);
        return true;
    }
    return false;
}
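
The two index loops above build the key by concatenating hashMerkleRoot and hashPrevBlock byte by byte. As a usage note, the same key layout can be expressed with System.arraycopy; this is an illustrative alternative, not the project's code.

byte[] newKey = new byte[hashMerkleRoot.length + hashPrevBlock.length];
System.arraycopy(hashMerkleRoot, 0, newKey, 0, hashMerkleRoot.length);
System.arraycopy(hashPrevBlock, 0, newKey, hashMerkleRoot.length, hashPrevBlock.length);
key.set(newKey, 0, newKey.length);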

From source file: org.zuinnote.hadoop.bitcoin.format.BitcoinRawBlockRecordReader.java

License: Apache License

/**
*
* Reads the next block as raw bytes.
*
* @param key is a 64-byte array (hashMerkleRoot followed by hashPrevBlock)
* @param value is the raw, serialized block as a BytesWritable
*
* @return true if next block is available, false if not
*/
public boolean next(BytesWritable key, BytesWritable value) throws IOException {
    // read all the blocks, including, if necessary, a block overlapping the split boundary
    while (getFilePosition() <= getEnd()) { // stop once we have gone beyond the split or there is no further data left
        ByteBuffer dataBlock = null;
        try {
            dataBlock = getBbr().readRawBlock();
        } catch (BitcoinBlockReadException e) {
            // log
            LOG.error(e);
        }
        if (dataBlock == null)
            return false;
        byte newKey[] = getBbr().getKeyFromRawBlock(dataBlock);
        key.set(newKey, 0, newKey.length);
        byte[] dataBlockArray = null;
        if (dataBlock.hasArray()) {
            // heap buffer: use the backing array directly
            dataBlockArray = dataBlock.array();
        } else {
            // direct buffer: no backing array, so copy the contents out
            dataBlockArray = new byte[dataBlock.capacity()];
            dataBlock.get(dataBlockArray);
        }
        value.set(dataBlockArray, 0, dataBlockArray.length);
        return true;
    }
    return false;
}

From source file: org.zuinnote.hadoop.bitcoin.format.BitcoinTransactionRecordReader.java

License: Apache License

/**
*
* Reads the next transaction.
*
* @param key is the hash of the current transaction
* @param value is a deserialized Java object of class BitcoinTransaction
*
* @return true if a next transaction is available, false if not
*/
public boolean next(BytesWritable key, BitcoinTransaction value) throws IOException {
    // read all the blocks, including, if necessary, a block overlapping the split boundary
    while (getFilePosition() <= getEnd()) { // stop once we have gone beyond the split or there is no further data left
        if ((currentBitcoinBlock == null)
                || (currentBitcoinBlock.getTransactions().size() == currentTransactionCounterInBlock)) {
            try {
                currentBitcoinBlock = getBbr().readBlock();
                currentTransactionCounterInBlock = 0;
            } catch (BitcoinBlockReadException e) {
                // log
                LOG.error(e);
            }
        }

        if (currentBitcoinBlock == null)
            return false;
        BitcoinTransaction currentTransaction = currentBitcoinBlock.getTransactions()
                .get(currentTransactionCounterInBlock);
        // the unique identifier that other transactions reference is the transaction hash
        byte[] newKey = new byte[0];
        try {
            newKey = BitcoinUtil.getTransactionHash(currentTransaction);
        } catch (NoSuchAlgorithmException nsae) {
            LOG.error("Cannot calculate transaction hash. Algorithm not available. Exception: "
                    + nsae.toString());
        }
        key.set(newKey, 0, newKey.length);
        value.set(currentTransaction);
        currentTransactionCounterInBlock++;
        return true;
    }
    return false;
}

From source file: phoenix.datatorrent.operator.input.kafka.PartitionableKafkaInputOperator.java

License: Open Source License

private BytesWritable getBytesFromKafka(Message msg) {
    ByteBuffer buf = msg.payload();
    BytesWritable payload = new BytesWritable();
    int origSize = buf.remaining();
    byte[] bytes = new byte[origSize];
    // the second argument is the offset into the destination array, not the buffer position
    buf.get(bytes, 0, origSize);
    payload.set(bytes, 0, origSize);
    return payload;
}
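
A more defensive variant of the conversion above (an illustrative sketch; toBytesWritable is not part of the source file) reads through a duplicate of the payload buffer, so the original buffer's position is left untouched for any other consumer.

private static BytesWritable toBytesWritable(ByteBuffer payload) {
    // duplicate() shares the content but keeps an independent position and limit
    ByteBuffer view = payload.duplicate();
    byte[] bytes = new byte[view.remaining()];
    view.get(bytes);

    BytesWritable result = new BytesWritable();
    result.set(bytes, 0, bytes.length);
    return result;
}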

From source file: protobuf.mapred.ProtobufRecordReader.java

License: Open Source License

public synchronized boolean next(LongWritable key, BytesWritable value) throws IOException {

    if (recordReader.isBlockConsumed(splitLength)) {
        LOG.info("Consumed all the split");
        key = null;
        value = null;
        return false;
    }
    recordio.RecordReader.Buffer data = recordReader.read();
    if (data == null) {
        LOG.info("get EOF, consumed all the file");
        key = null;
        value = null;
        return false;
    }
    pos = recordReader.getConsumedBytes() + start;
    key.set(pos);
    value.set(data.buffer, data.offset, data.length);
    return true;
}