Example usage for org.apache.hadoop.io BytesWritable set

Introduction

This page collects usage examples for org.apache.hadoop.io.BytesWritable.set.

Prototype

public void set(byte[] newData, int offset, int length) 

Document

Set the value to a copy of the given byte range
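
Before the project listings, here is a minimal, self-contained sketch (not taken from any of the projects below) illustrating the copy semantics of set(byte[], int, int): the writable copies the given range into its own buffer, reports the logical size via getLength(), and getBytes() may return a larger, padded array.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.BytesWritable;

public class BytesWritableSetExample {
    public static void main(String[] args) {
        byte[] data = "hello world".getBytes(StandardCharsets.UTF_8);

        // copy only the first five bytes ("hello") into the writable
        BytesWritable bw = new BytesWritable();
        bw.set(data, 0, 5);

        // the writable holds its own copy, so mutating the source has no effect on it
        data[0] = 'X';

        // getBytes() may be padded beyond the logical size; always honor getLength()
        String copied = new String(bw.getBytes(), 0, bw.getLength(), StandardCharsets.UTF_8);
        System.out.println(copied); // prints "hello"
    }
}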

Usage

From source file: org.apache.tez.runtime.library.common.comparator.TestProxyComparator.java

License: Apache License

private static final void set(BytesWritable bw, String s) {
    byte[] b = s.getBytes(Charset.forName("utf-8"));
    bw.set(b, 0, b.length);
}

From source file: org.commoncrawl.nutch.tools.arc.ArcRecordReader.java

License: Apache License

/**
 * <p>Returns true if the next record in the split is read into the key and 
 * value pair.  The key will be the arc record header and the value will be
 * the raw content bytes of the arc record.</p>
 * @param key The record key
 * @param value The record value
 * 
 * @return True if the next record is read.
 * 
 * @throws IOException If an error occurs while reading the record value.
 */
public boolean next(Text key, BytesWritable value) throws IOException {

    try {

        // get the starting position on the input stream
        long startRead = in.getPos();
        byte[] magicBuffer = null;

        // we need this loop to handle false positives in reading of gzip records
        while (true) {

            // while we haven't passed the end of the split
            if (startRead >= splitEnd) {
                return false;
            }

            // scanning for the gzip header
            boolean foundStart = false;
            while (!foundStart) {

                // start at the current file position and scan for 1K at time, break
                // if there is no more to read
                startRead = in.getPos();
                magicBuffer = new byte[1024];
                int read = in.read(magicBuffer);
                if (read < 0) {
                    break;
                }

                // scan the byte array for the gzip header magic number.  This happens
                // byte by byte
                for (int i = 0; i < read - 1; i++) {
                    byte[] testMagic = new byte[2];
                    System.arraycopy(magicBuffer, i, testMagic, 0, 2);
                    if (isMagic(testMagic)) {
                        // set the next start to the current gzip header
                        startRead += i;
                        foundStart = true;
                        break;
                    }
                }
            }

            // seek to the start of the gzip header
            in.seek(startRead);
            ByteArrayOutputStream baos = null;
            int totalRead = 0;

            try {

                // read 4K of the gzip at a time putting into a byte array
                byte[] buffer = new byte[4096];
                GZIPInputStream zin = new GZIPInputStream(in);
                int gzipRead = -1;
                baos = new ByteArrayOutputStream();
                while ((gzipRead = zin.read(buffer, 0, buffer.length)) != -1) {
                    baos.write(buffer, 0, gzipRead);
                    totalRead += gzipRead;
                }
            } catch (Exception e) {

                // there are times we get false positives where the gzip header exists
                // but it is not an actual gzip record, so we ignore it and start
                // over seeking
                // LOG.debug("Ignoring position: " + (startRead));
                if (startRead + 1 < fileLen) {
                    in.seek(startRead + 1);
                }
                continue;
            }

            // change the output stream to a byte array
            byte[] content = baos.toByteArray();

            // the first line of the raw content in arc files is the header
            int eol = 0;
            for (int i = 0; i < content.length; i++) {
                if (i > 0 && content[i] == '\n') {
                    eol = i;
                    break;
                }
            }

            // create the header and the raw content minus the header
            String header = new String(content, 0, eol).trim();
            byte[] raw = new byte[(content.length - eol) - 1];
            System.arraycopy(content, eol + 1, raw, 0, raw.length);

            // populate key and values with the header and raw content.
            Text keyText = (Text) key;
            keyText.set(header);
            BytesWritable valueBytes = (BytesWritable) value;
            valueBytes.set(raw, 0, raw.length);

            // TODO: It would be best to start at the end of the gzip read but 
            // the bytes read in gzip don't match raw bytes in the file so we 
            // overshoot the next header.  With this current method you get
            // some false positives but don't miss records.
            if (startRead + 1 < fileLen) {
                in.seek(startRead + 1);
            }

            // populated the record, now return
            return true;
        }
    } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
    }

    // couldn't populate the record or there is no next record to read
    return false;
}
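
The listing above calls a private isMagic(byte[]) helper that is not shown. A plausible reconstruction, assuming it simply tests the two-byte gzip magic number (0x1f, 0x8b), might look like the sketch below; it is for illustration only, not the actual source.

// hypothetical reconstruction of the helper referenced above
private static final byte[] MAGIC = {(byte) 0x1f, (byte) 0x8b};

private static boolean isMagic(byte[] candidate) {
    // a gzip member always starts with the bytes 0x1f 0x8b
    return candidate != null && candidate.length >= MAGIC.length
            && candidate[0] == MAGIC[0] && candidate[1] == MAGIC[1];
}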

From source file: org.geotools.WholeFile.WholeFileRecordReader.java

License: Apache License

@Override
public boolean next(Text key, BytesWritable value) throws IOException {
    if (!processed) {
        byte[] contents = new byte[(int) fileSplit.getLength()];
        Path file = fileSplit.getPath();

        String fileName = file.getName();
        key.set(fileName);

        FileSystem fs = file.getFileSystem(conf);
        FSDataInputStream in = null;
        try {
            in = fs.open(file);
            IOUtils.readFully(in, contents, 0, contents.length);
            value.set(contents, 0, contents.length);
        } finally {
            IOUtils.closeStream(in);
        }
        processed = true;
        return true;
    }
    return false;
}

From source file: org.pentaho.hadoop.mapreduce.converter.converters.KettleTypeToBytesWritableConverter.java

License: Apache License

@Override
public BytesWritable convert(ValueMetaInterface meta, Object obj) throws TypeConversionException {
    try {
        BytesWritable result = new BytesWritable();
        byte[] binary = meta.getBinary(obj);
        result.set(binary, 0, binary.length);
        return result;
    } catch (Exception ex) {
        throw new TypeConversionException(BaseMessages.getString(TypeConverterFactory.class, "ErrorConverting",
                BytesWritable.class.getSimpleName(), obj), ex);
    }
}

From source file: org.pooledtimeseries.cartesian.CartesianRecordReader.java

License: Apache License

@Override
public boolean next(Text key, BytesWritable value) throws IOException {

    do {
        // If we are to go to the next left key/value pair
        if (goToNextLeft) {
            // Read the next key value pair, false means no more pairs
            if (!leftRR.next(lkey, lvalue)) {
                // If no more, then this task is nearly finished
                alldone = true;
                break;
            } else {
                // If we aren't done, set the value to the key and set
                // our flags
                goToNextLeft = alldone = false;

                // Reset the right record reader
                this.rightRR = this.rightFIF.getRecordReader(this.rightIS, this.rightConf, this.rightReporter);
            }

            if (this.pairWithItself) {
                // shifting right data set to avoid repeated pairs
                // we consider a,b == b,a
                for (int i = 0; i < rightShiftCount; i++) {
                    rightRR.next(rkey, rvalue);
                }
                rightShiftCount++;
            }
        }

        // Read the next key value pair from the right data set
        if (rightRR.next(rkey, rvalue)) {
            // If success, set key and value for left and right splits
            key.set(lkey.toString() + "~" + rkey.toString());
            // Merge FeatureVector of both videos
            // Order is important and should be same as order of key
            List<FeatureVector> featureList = (List<FeatureVector>) PoTSerialiser.getObject(lvalue.getBytes());
            featureList.addAll((List<FeatureVector>) PoTSerialiser.getObject(rvalue.getBytes()));
            byte[] featureListBytes = PoTSerialiser.getBytes(featureList);
            value.set(featureListBytes, 0, featureListBytes.length);

            // This assumes that key will always be unique among all splits
            if (lkey.toString().equals(rkey.toString())) {
                this.pairWithItself = true;
            }
        } else {
            // Otherwise, this right data set is complete
            // and we should go to the next left pair
            goToNextLeft = true;
        }

        // This loop will continue if we finished reading key/value
        // pairs from the right data set
    } while (goToNextLeft);

    if (alldone) {
        // reset shift counter
        rightShiftCount = 1;
        this.pairWithItself = false;
    }
    // Return true if a key/value pair was read, false otherwise
    return !alldone;
}

From source file: org.zuinnote.hadoop.bitcoin.format.BitcoinBlockRecordReader.java

License: Apache License

/**
*
* Reads the next block.
*
* @param key is a 64-byte array (hashMerkleRoot followed by hashPrevBlock)
* @param value is a deserialized Java object of class BitcoinBlock
*
* @return true if next block is available, false if not
*/
public boolean next(BytesWritable key, BitcoinBlock value) throws IOException {
    // read all the blocks, including, if necessary, a block overlapping the split boundary
    while (getFilePosition() <= getEnd()) { // stop once we have gone beyond the split or there is no further data left
        BitcoinBlock dataBlock = null;
        try {
            dataBlock = getBbr().readBlock();

        } catch (BitcoinBlockReadException e) {
            // log
            LOG.error(e);
        }
        if (dataBlock == null)
            return false;
        byte[] hashMerkleRoot = dataBlock.getHashMerkleRoot();
        byte[] hashPrevBlock = dataBlock.getHashPrevBlock();
        byte[] newKey = new byte[hashMerkleRoot.length + hashPrevBlock.length];
        for (int i = 0; i < hashMerkleRoot.length; i++) {
            newKey[i] = hashMerkleRoot[i];
        }
        for (int j = 0; j < hashPrevBlock.length; j++) {
            newKey[j + hashMerkleRoot.length] = hashPrevBlock[j];
        }
        key.set(newKey, 0, newKey.length);
        value.set(dataBlock);
        return true;
    }
    return false;
}
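
The two index loops above build the key by concatenating hashMerkleRoot and hashPrevBlock byte by byte. As a usage note, the same key layout can be expressed with System.arraycopy; this is an illustrative alternative, not the project's code.

byte[] newKey = new byte[hashMerkleRoot.length + hashPrevBlock.length];
System.arraycopy(hashMerkleRoot, 0, newKey, 0, hashMerkleRoot.length);
System.arraycopy(hashPrevBlock, 0, newKey, hashMerkleRoot.length, hashPrevBlock.length);
key.set(newKey, 0, newKey.length);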

From source file: org.zuinnote.hadoop.bitcoin.format.BitcoinRawBlockRecordReader.java

License: Apache License

/**
*
* Reads the next block as raw bytes.
*
* @param key is a 64-byte array (hashMerkleRoot followed by hashPrevBlock)
* @param value is the raw, serialized block as a BytesWritable
*
* @return true if next block is available, false if not
*/
public boolean next(BytesWritable key, BytesWritable value) throws IOException {
    // read all the blocks, including, if necessary, a block overlapping the split boundary
    while (getFilePosition() <= getEnd()) { // stop once we have gone beyond the split or there is no further data left
        ByteBuffer dataBlock = null;
        try {
            dataBlock = getBbr().readRawBlock();
        } catch (BitcoinBlockReadException e) {
            // log
            LOG.error(e);
        }
        if (dataBlock == null)
            return false;
        byte newKey[] = getBbr().getKeyFromRawBlock(dataBlock);
        key.set(newKey, 0, newKey.length);
        byte[] dataBlockArray = null;
        if (dataBlock.hasArray()) {
            // heap buffer: use the backing array directly
            dataBlockArray = dataBlock.array();
        } else {
            // direct buffer: no backing array, so copy the contents out
            dataBlockArray = new byte[dataBlock.capacity()];
            dataBlock.get(dataBlockArray);
        }
        value.set(dataBlockArray, 0, dataBlockArray.length);
        return true;
    }
    return false;
}

From source file: org.zuinnote.hadoop.bitcoin.format.BitcoinTransactionRecordReader.java

License: Apache License

/**
*
* Reads the next transaction.
*
* @param key is the hash of the current transaction
* @param value is a deserialized Java object of class BitcoinTransaction
*
* @return true if a next transaction is available, false if not
*/
public boolean next(BytesWritable key, BitcoinTransaction value) throws IOException {
    // read all the blocks, including, if necessary, a block overlapping the split boundary
    while (getFilePosition() <= getEnd()) { // stop once we have gone beyond the split or there is no further data left
        if ((currentBitcoinBlock == null)
                || (currentBitcoinBlock.getTransactions().size() == currentTransactionCounterInBlock)) {
            try {
                currentBitcoinBlock = getBbr().readBlock();
                currentTransactionCounterInBlock = 0;
            } catch (BitcoinBlockReadException e) {
                // log
                LOG.error(e);
            }
        }

        if (currentBitcoinBlock == null)
            return false;
        BitcoinTransaction currentTransaction = currentBitcoinBlock.getTransactions()
                .get(currentTransactionCounterInBlock);
        // the unique identifier that other transactions reference is the transaction hash
        byte[] newKey = new byte[0];
        try {
            newKey = BitcoinUtil.getTransactionHash(currentTransaction);
        } catch (NoSuchAlgorithmException nsae) {
            LOG.error("Cannot calculate transaction hash. Algorithm not available. Exception: "
                    + nsae.toString());
        }
        key.set(newKey, 0, newKey.length);
        value.set(currentTransaction);
        currentTransactionCounterInBlock++;
        return true;
    }
    return false;
}

From source file: phoenix.datatorrent.operator.input.kafka.PartitionableKafkaInputOperator.java

License: Open Source License

private BytesWritable getBytesFromKafka(Message msg) {
    ByteBuffer buf = msg.payload();
    BytesWritable payload = new BytesWritable();
    int origSize = buf.remaining();
    byte[] bytes = new byte[origSize];
    // the second argument is the offset into the destination array, not the buffer position
    buf.get(bytes, 0, origSize);
    payload.set(bytes, 0, origSize);
    return payload;
}
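
A more defensive variant of the conversion above (an illustrative sketch; toBytesWritable is not part of the source file) reads through a duplicate of the payload buffer, so the original buffer's position is left untouched for any other consumer.

private static BytesWritable toBytesWritable(ByteBuffer payload) {
    // duplicate() shares the content but keeps an independent position and limit
    ByteBuffer view = payload.duplicate();
    byte[] bytes = new byte[view.remaining()];
    view.get(bytes);

    BytesWritable result = new BytesWritable();
    result.set(bytes, 0, bytes.length);
    return result;
}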

From source file: protobuf.mapred.ProtobufRecordReader.java

License: Open Source License

public synchronized boolean next(LongWritable key, BytesWritable value) throws IOException {

    if (recordReader.isBlockConsumed(splitLength)) {
        LOG.info("Consumed all the split");
        key = null;
        value = null;
        return false;
    }
    recordio.RecordReader.Buffer data = recordReader.read();
    if (data == null) {
        LOG.info("get EOF, consumed all the file");
        key = null;
        value = null;
        return false;
    }
    pos = recordReader.getConsumedBytes() + start;
    key.set(pos);
    value.set(data.buffer, data.offset, data.length);
    return true;
}