Example usage for org.apache.hadoop.io BytesWritable getBytes

List of usage examples for org.apache.hadoop.io BytesWritable getBytes

Introduction

On this page you can find example usages of org.apache.hadoop.io.BytesWritable.getBytes().

Prototype

@Override
public byte[] getBytes() 

Document

Get the data backing the BytesWritable.
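
One caveat that applies to every example below: getBytes() returns the writable's backing array, which is often longer than the number of valid bytes, so it must always be paired with getLength(). A minimal self-contained sketch (the class name is ours, for illustration):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.hadoop.io.BytesWritable;

public class GetBytesDemo {
    public static void main(String[] args) {
        BytesWritable bw = new BytesWritable();
        bw.set("hello".getBytes(StandardCharsets.UTF_8), 0, 5);

        byte[] backing = bw.getBytes(); // backing array; may extend past the valid data
        int valid = bw.getLength();     // number of bytes actually stored

        // copy out exactly the stored bytes before handing them to code that
        // trusts the array length instead of an explicit length argument
        byte[] exact = Arrays.copyOf(backing, valid);
        System.out.println(backing.length + " backing bytes, " + exact.length + " valid");
    }
}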

Usage

From source file:org.apache.sqoop.mapreduce.hcat.SqoopHCatImportHelper.java

License:Apache License

private Object toHCat(Object val, HCatFieldSchema.Type hfsType, String hCatTypeString) {

    if (val == null) {
        return null;
    }

    Object retVal = null;

    if (val instanceof Number) {
        retVal = convertNumberTypes(val, hfsType);
    } else if (val instanceof Boolean) {
        retVal = convertBooleanTypes(val, hfsType);
    } else if (val instanceof String) {
        if (hfsType == HCatFieldSchema.Type.STRING) {
            String str = (String) val;
            if (doHiveDelimsReplacement) {
                retVal = FieldFormatter.hiveStringReplaceDelims(str, hiveDelimsReplacement, hiveDelimiters);
            } else {
                retVal = str;
            }
        }
    } else if (val instanceof java.util.Date) {
        retVal = converDateTypes(val, hfsType);
    } else if (val instanceof BytesWritable) {
        if (hfsType == HCatFieldSchema.Type.BINARY) {
            BytesWritable bw = (BytesWritable) val;
            retVal = bw.getBytes();
        }
    } else if (val instanceof BlobRef) {
        if (hfsType == HCatFieldSchema.Type.BINARY) {
            BlobRef br = (BlobRef) val;
            byte[] bytes = br.isExternal() ? br.toString().getBytes() : br.getData();
            retVal = bytes;
        }
    } else if (val instanceof ClobRef) {
        if (hfsType == HCatFieldSchema.Type.STRING) {
            ClobRef cr = (ClobRef) val;
            String s = cr.isExternal() ? cr.toString() : cr.getData();
            retVal = s;
        }
    } else {
        throw new UnsupportedOperationException(
                "Objects of type " + val.getClass().getName() + " are not supported");
    }
    if (retVal == null) {
        LOG.error("Objects of type " + val.getClass().getName() + " can not be mapped to HCatalog type "
                + hCatTypeString);
    }
    return retVal;
}
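
Note that the BytesWritable branch above assigns bw.getBytes() directly, which exposes the writable's backing array; that array can carry stale padding beyond getLength() when the object has been reused. A defensive variant of just that branch (a sketch of our own, not the actual Sqoop code) copies only the valid range:

    } else if (val instanceof BytesWritable) {
        if (hfsType == HCatFieldSchema.Type.BINARY) {
            BytesWritable bw = (BytesWritable) val;
            // copy only the valid bytes instead of exposing the padded backing array
            retVal = java.util.Arrays.copyOf(bw.getBytes(), bw.getLength());
        }
    }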

From source file:org.apache.tajo.storage.sequencefile.SequenceFileScanner.java

License:Apache License

@Override
public Tuple next() throws IOException {
    if (!more)
        return null;

    long pos = reader.getPosition();
    boolean remaining = reader.next(EMPTY_KEY);

    if (pos >= end && reader.syncSeen()) {
        more = false;
    } else {
        more = remaining;
    }

    if (more) {
        Tuple tuple = null;
        byte[][] cells;

        if (hasBinarySerDe) {
            BytesWritable bytesWritable = new BytesWritable();
            reader.getCurrentValue(bytesWritable);
            tuple = makeTuple(bytesWritable);
            totalBytes += (long) bytesWritable.getBytes().length;
        } else {
            Text text = new Text();
            reader.getCurrentValue(text);
            cells = BytesUtils.splitPreserveAllTokens(text.getBytes(), delimiter, projectionMap,
                    schema.getColumns().size());
            totalBytes += (long) text.getBytes().length;
            tuple = new LazyTuple(schema, cells, 0, nullChars, serde);
        }
        currentIdx++;
        return tuple;
    } else {
        return null;
    }
}
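
One detail worth flagging in this scanner: totalBytes is incremented by bytesWritable.getBytes().length, which measures the capacity of the backing array rather than the bytes actually read. If exact accounting matters, getLength() is the safer measure; the adjusted lines would look like this (our change, not Tajo's code):

    totalBytes += bytesWritable.getLength(); // valid bytes only, not backing-array capacity
    totalBytes += text.getLength();          // likewise for the Text branch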

From source file:org.apache.tajo.storage.sequencefile.SequenceFileScanner.java

License:Apache License

/**
 * In Hive, LazyBinarySerDe is serialized as follows: start A B A B A B end bytes[] ->
 * |-----|---------|--- ... ---|-----|---------|
 *
 * Section A is one null-byte, corresponding to eight struct fields in Section
 * B. Each bit indicates whether the corresponding field is null (0) or not null
 * (1). Each field is a LazyBinaryObject.
 *
 * Following B, there is another section A and B. This pattern repeats until
 * all struct fields are serialized.
 *
 * So, Tajo must build a tuple after parsing the Hive-style BinarySerDe.
 */
private Tuple makeTuple(BytesWritable value) throws IOException {
    Tuple tuple = new VTuple(schema.getColumns().size());

    int start = 0;
    int length = value.getLength();

    /**
     * Note that one null byte is followed by eight fields, then another null
     * byte and more fields.
     */
    int structByteEnd = start + length;
    byte[] bytes = value.getBytes();

    byte nullByte = bytes[start];
    int lastFieldByteEnd = start + 1;

    // Go through all bytes in the byte[]
    for (int i = 0; i < schema.getColumns().size(); i++) {
        fieldIsNull[i] = true;
        if ((nullByte & (1 << (i % 8))) != 0) {
            fieldIsNull[i] = false;
            parse(schema.getColumn(i), bytes, lastFieldByteEnd);

            fieldStart[i] = lastFieldByteEnd + elementOffset;
            fieldLength[i] = elementSize;
            lastFieldByteEnd = fieldStart[i] + fieldLength[i];

            for (int j = 0; j < projectionMap.length; j++) {
                if (projectionMap[j] == i) {
                    Datum datum = serde.deserialize(schema.getColumn(i), bytes, fieldStart[i], fieldLength[i],
                            nullChars);
                    tuple.put(i, datum);
                }
            }
        }

        // next byte is a null byte if there are more bytes to go
        if (7 == (i % 8)) {
            if (lastFieldByteEnd < structByteEnd) {
                nullByte = bytes[lastFieldByteEnd];
                lastFieldByteEnd++;
            } else {
                // otherwise all null afterwards
                nullByte = 0;
                lastFieldByteEnd++;
            }
        }
    }

    return tuple;
}
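
The bitmask convention above is easy to verify in isolation: within each group of eight fields, field i is non-null exactly when bit (i % 8) of the preceding null byte is set. A minimal demonstration with an illustrative value:

    byte nullByte = (byte) 0b00000101; // fields 0 and 2 present, the rest null
    for (int i = 0; i < 8; i++) {
        boolean fieldIsNull = (nullByte & (1 << i)) == 0;
        System.out.println("field " + i + " is " + (fieldIsNull ? "null" : "not null"));
    }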

From source file:org.apache.tez.runtime.library.common.comparator.TezBytesComparator.java

License:Apache License

@Override
public int getProxy(BytesWritable key) {
    int prefix = 0;
    final int len = key.getLength();
    final byte[] content = key.getBytes();
    int b1 = 0, b2 = 0, b3 = 0;
    switch (len) {
    default:
    case 3:
        b3 = content[2] & 0xff;
    case 2:
        b2 = content[1] & 0xff;
    case 1:
        b1 = content[0] & 0xff;
    case 0:
    }
    prefix = (b1 << 16) | (b2 << 8) | (b3);
    return prefix;
}
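
The comparator packs the first three key bytes big-endian into a non-negative int, so comparing proxies as ints agrees with unsigned lexicographic comparison of those leading bytes; keys whose proxies tie still require a full byte-wise compare. A standalone check of that property (the helper mirrors the switch above; the class and method names are ours, not Tez's):

import org.apache.hadoop.io.BytesWritable;

public class ProxyOrderCheck {
    // pack up to three leading bytes big-endian, as getProxy() does above
    static int proxy(BytesWritable key) {
        byte[] c = key.getBytes();
        int len = key.getLength();
        int b1 = len > 0 ? c[0] & 0xff : 0;
        int b2 = len > 1 ? c[1] & 0xff : 0;
        int b3 = len > 2 ? c[2] & 0xff : 0;
        return (b1 << 16) | (b2 << 8) | b3;
    }

    public static void main(String[] args) {
        BytesWritable a = new BytesWritable(new byte[] { 0x01, 0x02 });
        BytesWritable b = new BytesWritable(new byte[] { 0x01, 0x03, 0x7f });
        // proxy(a) = 0x010200 < proxy(b) = 0x01037f, matching the raw byte order
        System.out.println(proxy(a) < proxy(b)); // prints true
    }
}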

From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java

License:Apache License

static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;
    // iterate and validate stuff ...
    Text key = new Text();
    BytesWritable value = new BytesWritable();
    while (reader.next(key, value)) {

        TestRecord testRecord = records.get(itemIndex++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping
        // replaces invalid characters with ?, which would break this test, since it deliberately
        // uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in utf-8), terminated by \r\n\r\n; the content
        // follows this terminator. We search for this byte pattern to locate the start of content,
        // then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);

}
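
The header-terminator scan above, bounded by value.getLength() so it never reads into the backing array's padding, recurs verbatim in the tests below. A minimal standalone version using only JDK calls (the helper name is ours; assumes org.apache.hadoop.io.BytesWritable is imported):

static int indexOfHeaderTerminator(BytesWritable value) {
    byte[] buf = value.getBytes();
    int len = value.getLength(); // never scan past the valid data
    byte[] pattern = { '\r', '\n', '\r', '\n' };
    outer:
    for (int i = 0; i + pattern.length <= len; i++) {
        for (int j = 0; j < pattern.length; j++) {
            if (buf[i + j] != pattern[j]) {
                continue outer;
            }
        }
        return i; // offset of the first \r\n\r\n
    }
    return -1; // terminator not found
}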

From source file:org.commoncrawl.hadoop.io.mapred.ArcFileRecordReaderTests.java

License:Apache License

@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {

    JobConf conf = new JobConf();
    FileSystem fs = LocalFileSystem.get(conf);
    Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));
    List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    FSDataOutputStream os = fs.create(path);
    try {
        // write the ARC File into memory 
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                    testAttemptTime);
        }
        os.flush();
    } finally {
        os.close();
    }

    FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);
    ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(conf, split);

    int index = 0;

    // iterate and validate stuff ... 
    Text key = reader.createKey();
    BytesWritable value = reader.createValue();

    while (reader.next(key, value)) {

        TestRecord testRecord = records.get(index++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping
        // replaces invalid characters with ?, which would break this test, since it deliberately
        // uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in utf-8), terminated by \r\n\r\n; the content
        // follows this terminator. We search for this byte pattern to locate the start of content,
        // then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    fs.delete(path, false);
}

From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java

License:Apache License

static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;
    // iterate and validate stuff ... 
    while (reader.nextKeyValue()) {
        Text key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();

        TestRecord testRecord = records.get(itemIndex++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping
        // replaces invalid characters with ?, which would break this test, since it deliberately
        // uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in utf-8), terminated by \r\n\r\n; the content
        // follows this terminator. We search for this byte pattern to locate the start of content,
        // then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);

}

From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileRecordReaderTests.java

License:Apache License

@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {

    Configuration conf = new Configuration();
    FileSystem fs = LocalFileSystem.get(conf);
    Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));
    List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    FSDataOutputStream os = fs.create(path);
    try {
        // write the ARC File into memory 
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                    testAttemptTime);
        }
        os.flush();
    } finally {
        os.close();
    }

    FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);
    ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(split, new TaskAttemptContext(conf, new TaskAttemptID()));

    int index = 0;

    // iterate and validate stuff ... 
    while (reader.nextKeyValue()) {
        Text key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();

        TestRecord testRecord = records.get(index++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping
        // replaces invalid characters with ?, which would break this test, since it deliberately
        // uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in utf-8), terminated by \r\n\r\n; the content
        // follows this terminator. We search for this byte pattern to locate the start of content,
        // then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    fs.delete(path, false);
}

From source file:org.commoncrawl.util.shared.ARCFileReader.java

License:Apache License

public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException {

    Configuration conf = new Configuration();

    String path = null;

    CommandLineParser parser = new GnuParser();

    try {
        // parse the command line arguments
        CommandLine cmdLine = parser.parse(options, args);

        // get ARCFile Path
        path = cmdLine.getOptionValue("file");

        // get optional config 
        if (cmdLine.hasOption("conf")) {
            conf.addResource(new Path(cmdLine.getOptionValue("conf")));
        }
        if (cmdLine.hasOption("awsAccessKey")) {
            conf.set("fs.s3n.awsAccessKeyId", cmdLine.getOptionValue("awsAccessKey"));
        }
        if (cmdLine.hasOption("awsSecret")) {
            conf.set("fs.s3n.awsSecretAccessKey", cmdLine.getOptionValue("awsSecret"));
        }
    } catch (ParseException e) {
        System.out.println(e.toString());
        printUsage();
        System.exit(1);
    }

    final URI uri = new URI(path);
    FileSystem fs = FileSystem.get(uri, conf);

    //    byte data[] = new byte[4096*10];
    //    int readAmt = 0;
    //    while ((readAmt = stream.get().read(data)) != -1) { 
    //      System.out.println(HexDump.dumpHexString(data, 0, readAmt));
    //    }
    //    stream.get().close();
    //    System.exit(1);

    ARCFileReader reader = null;

    try {
        System.out.println("Initializing Reader for Path:" + uri);
        reader = new ARCFileReader(fs.open(new Path(path)));

        Text key = new Text();
        BytesWritable value = new BytesWritable();

        while (reader.hasMoreItems()) {
            reader.nextKeyValue(key, value);
            int indexOfTrailingCRLF = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                    "\r\n\r\n".getBytes());
            int headerLen = indexOfTrailingCRLF + 4;
            int contentLen = value.getLength() - headerLen;

            String outputStr = "Key:" + key.toString() + " HeaderLen:" + headerLen + " ContentLen:"
                    + contentLen;
            System.out.println(outputStr);

            //String contentStr = new String(value.getBytes(),headerLen,contentLen,Charset.forName("ASCII"));
            //System.out.println(contentStr.substring(contentStr.length() - 20));
        }
        System.out.println("Exiting Loop");
    } catch (Exception e) {
        System.out.println(CCStringUtils.stringifyException(e));
        LOG.error(CCStringUtils.stringifyException(e));
        //throw new IOException(e);
    } finally {
        if (reader != null) {
            System.out.println("***Closing Reader");
            reader.close();
        }
    }
}

From source file:org.commoncrawl.util.shared.ArcFileReaderTests.java

License:Apache License

/** 
 * test basic reader functionality by creating a mock ARCFile in memory and then reading it back and validating the contents... 
 */
@Test
public void testReader() {
    DataOutputBuffer os = new DataOutputBuffer();
    long timestamp = System.currentTimeMillis();
    try {
        // write the ARC File into memory 
        writeFirstRecord(os, "test", timestamp);
        List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);
        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            NIOHttpHeaders headers = new NIOHttpHeaders();
            for (int i = 0; i < record.headers.size(); ++i) {
                headers.set(record.headers.get(i).e0, record.headers.get(i).e1);
            }

            write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers, "text/html",
                    MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
        }
        os.flush();
        os.close();

        final AtomicBoolean streamClosed = new AtomicBoolean();
        // setup ArcFileReader to read the file 
        InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) {

            public synchronized int read(byte b[], int off, int len) {
                // force single-byte reads (exercises the reader's incremental parsing)
                len = 1;
                return super.read(b, off, len);
            }

            public void close() throws IOException {
                super.close();
                streamClosed.set(true);
            }
        };
        ARCFileReader reader = new ARCFileReader(in);
        int index = 0;
        Text key = new Text();
        BytesWritable value = new BytesWritable();

        // iterate and validate stuff ... 
        while (reader.hasMoreItems()) {
            reader.nextKeyValue(key, value);
            TestRecord testRecord = records.get(index++);
            // get test key bytes as utf-8 bytes ... 
            byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
            // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping
            // replaces invalid characters with ?, which would break this test, since it deliberately
            // uses invalid characters to form the key).
            Assert.assertTrue(
                    compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);
            // returned bytes represent the header (encoded in utf-8), terminated by \r\n\r\n; the content
            // follows this terminator. We search for this byte pattern to locate the start of content,
            // then compare it against the source ...
            int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                    "\r\n\r\n".getBytes());
            if (indexofHeaderTerminator == -1) {
                throw new IOException("No Header Terminator found in Value!");
            }
            indexofHeaderTerminator += 4;
            // read headers ... 
            String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator,
                    Charset.forName("UTF-8"));
            NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
            for (int i = 0; i < testRecord.headers.size(); ++i) {
                Pair<String, String> testHeaderRecord = testRecord.headers.get(i);
                Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
                Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0));
            }

            Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(),
                    indexofHeaderTerminator, testRecord.data.length) == 0);
        }
        reader.close();

        Assert.assertEquals(index, BASIC_TEST_RECORD_COUNT);
        Assert.assertTrue(streamClosed.get());
    } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}