List of usage examples for org.apache.hadoop.io.BytesWritable#getBytes()
@Override public byte[] getBytes()
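A recurring caveat in the examples below: getBytes() exposes the writable's backing buffer, which may be longer than the valid data, so it should normally be paired with getLength(). A minimal sketch of the safe pattern (class and variable names here are illustrative, not taken from the examples):

import java.util.Arrays;
import org.apache.hadoop.io.BytesWritable;

public class GetBytesSketch {
    public static void main(String[] args) {
        BytesWritable bw = new BytesWritable(new byte[] { 1, 2, 3 });
        bw.setSize(2); // the backing buffer still holds 3 bytes

        byte[] raw = bw.getBytes();   // may include trailing padding
        int valid = bw.getLength();   // number of meaningful bytes

        // safe: truncate to the valid length before handing the array elsewhere
        byte[] exact = Arrays.copyOf(raw, valid);
        // on recent Hadoop versions, bw.copyBytes() does the same copy
        System.out.println(raw.length + " raw bytes vs " + exact.length + " valid");
    }
}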
From source file:org.apache.sqoop.mapreduce.hcat.SqoopHCatImportHelper.java
License:Apache License
private Object toHCat(Object val, HCatFieldSchema.Type hfsType, String hCatTypeString) {
    if (val == null) {
        return null;
    }

    Object retVal = null;

    if (val instanceof Number) {
        retVal = convertNumberTypes(val, hfsType);
    } else if (val instanceof Boolean) {
        retVal = convertBooleanTypes(val, hfsType);
    } else if (val instanceof String) {
        if (hfsType == HCatFieldSchema.Type.STRING) {
            String str = (String) val;
            if (doHiveDelimsReplacement) {
                retVal = FieldFormatter.hiveStringReplaceDelims(str, hiveDelimsReplacement, hiveDelimiters);
            } else {
                retVal = str;
            }
        }
    } else if (val instanceof java.util.Date) {
        retVal = converDateTypes(val, hfsType);
    } else if (val instanceof BytesWritable) {
        if (hfsType == HCatFieldSchema.Type.BINARY) {
            BytesWritable bw = (BytesWritable) val;
            retVal = bw.getBytes();
        }
    } else if (val instanceof BlobRef) {
        if (hfsType == HCatFieldSchema.Type.BINARY) {
            BlobRef br = (BlobRef) val;
            byte[] bytes = br.isExternal() ? br.toString().getBytes() : br.getData();
            retVal = bytes;
        }
    } else if (val instanceof ClobRef) {
        if (hfsType == HCatFieldSchema.Type.STRING) {
            ClobRef cr = (ClobRef) val;
            String s = cr.isExternal() ? cr.toString() : cr.getData();
            retVal = s;
        }
    } else {
        throw new UnsupportedOperationException(
            "Objects of type " + val.getClass().getName() + " are not supported");
    }
    if (retVal == null) {
        LOG.error("Objects of type " + val.getClass().getName() + " can not be mapped to HCatalog type "
            + hCatTypeString);
    }
    return retVal;
}
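Note that retVal = bw.getBytes() hands HCatalog the whole backing buffer, which can include padding past getLength() if the writable was resized or reused. A defensive variant of that branch might copy only the valid region; a sketch, with a hypothetical helper name:

import java.util.Arrays;
import org.apache.hadoop.io.BytesWritable;

class BinaryCopy {
    // Defensive alternative to "retVal = bw.getBytes()": copy only the
    // valid region, since the backing buffer may be padded past getLength().
    static byte[] toBinary(BytesWritable bw) {
        return Arrays.copyOf(bw.getBytes(), bw.getLength());
    }
}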
From source file:org.apache.tajo.storage.sequencefile.SequenceFileScanner.java
License:Apache License
@Override
public Tuple next() throws IOException {
    if (!more)
        return null;

    long pos = reader.getPosition();
    boolean remaining = reader.next(EMPTY_KEY);

    if (pos >= end && reader.syncSeen()) {
        more = false;
    } else {
        more = remaining;
    }

    if (more) {
        Tuple tuple = null;
        byte[][] cells;

        if (hasBinarySerDe) {
            BytesWritable bytesWritable = new BytesWritable();
            reader.getCurrentValue(bytesWritable);
            tuple = makeTuple(bytesWritable);
            totalBytes += (long) bytesWritable.getBytes().length;
        } else {
            Text text = new Text();
            reader.getCurrentValue(text);
            cells = BytesUtils.splitPreserveAllTokens(text.getBytes(), delimiter, projectionMap,
                schema.getColumns().size());
            totalBytes += (long) text.getBytes().length;
            tuple = new LazyTuple(schema, cells, 0, nullChars, serde);
        }
        currentIdx++;
        return tuple;
    } else {
        return null;
    }
}
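One subtlety in the binary branch above: totalBytes is advanced by getBytes().length, which is the capacity of the (possibly padded, reused) buffer rather than the record size. If exact byte accounting is wanted, getLength() is the safer measure; a sketch with illustrative names:

import org.apache.hadoop.io.BytesWritable;

class ByteAccounting {
    static long addRecordBytes(long totalBytes, BytesWritable value) {
        // getLength() counts only the valid bytes of this record;
        // getBytes().length would also count any reusable buffer padding.
        return totalBytes + value.getLength();
    }
}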
From source file:org.apache.tajo.storage.sequencefile.SequenceFileScanner.java
License:Apache License
/**
 * In Hive, LazyBinarySerDe is serialized as follows:
 *
 *   start      A         B       ...     A         B       end
 *   bytes[] -> |-----|---------|--- ... ---|-----|---------|
 *
 * Section A is one null byte, corresponding to eight struct fields in
 * Section B. Each bit indicates whether the corresponding field is null (0)
 * or not null (1). Each field is a LazyBinaryObject.
 *
 * Following B, there is another section A and B. This pattern repeats until
 * all struct fields are serialized.
 *
 * So, Tajo must build a tuple after parsing the Hive-style BinarySerDe.
 */
private Tuple makeTuple(BytesWritable value) throws IOException {
    Tuple tuple = new VTuple(schema.getColumns().size());
    int start = 0;
    int length = value.getLength();

    /**
     * Please note that one null byte is followed by eight fields, then
     * another null byte and more fields.
     */
    int structByteEnd = start + length;
    byte[] bytes = value.getBytes();
    byte nullByte = bytes[start];
    int lastFieldByteEnd = start + 1;

    // Go through all bytes in the byte[]
    for (int i = 0; i < schema.getColumns().size(); i++) {
        fieldIsNull[i] = true;
        if ((nullByte & (1 << (i % 8))) != 0) {
            fieldIsNull[i] = false;
            parse(schema.getColumn(i), bytes, lastFieldByteEnd);

            fieldStart[i] = lastFieldByteEnd + elementOffset;
            fieldLength[i] = elementSize;
            lastFieldByteEnd = fieldStart[i] + fieldLength[i];

            for (int j = 0; j < projectionMap.length; j++) {
                if (projectionMap[j] == i) {
                    Datum datum = serde.deserialize(schema.getColumn(i), bytes, fieldStart[i],
                        fieldLength[i], nullChars);
                    tuple.put(i, datum);
                }
            }
        }

        // next byte is a null byte if there are more bytes to go
        if (7 == (i % 8)) {
            if (lastFieldByteEnd < structByteEnd) {
                nullByte = bytes[lastFieldByteEnd];
                lastFieldByteEnd++;
            } else {
                // otherwise all null afterwards
                nullByte = 0;
                lastFieldByteEnd++;
            }
        }
    }
    return tuple;
}
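To make the null-byte scheme concrete, here is a standalone illustration (not Tajo code) of the bit test used above, where bit i of each null byte covers field 8k + i and a set bit means the field is present:

public class NullByteDemo {
    public static void main(String[] args) {
        byte nullByte = (byte) 0b00000101; // fields 0 and 2 present, others null
        for (int i = 0; i < 8; i++) {
            // same test as makeTuple(): bit (i % 8) of the current null byte
            boolean present = (nullByte & (1 << (i % 8))) != 0;
            System.out.println("field " + i + (present ? ": present" : ": null"));
        }
    }
}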
From source file:org.apache.tez.runtime.library.common.comparator.TezBytesComparator.java
License:Apache License
@Override
public int getProxy(BytesWritable key) {
    int prefix = 0;
    final int len = key.getLength();
    final byte[] content = key.getBytes();
    int b1 = 0, b2 = 0, b3 = 0;
    switch (len) {
    default:
    case 3:
        b3 = content[2] & 0xff;
    case 2:
        b2 = content[1] & 0xff;
    case 1:
        b1 = content[0] & 0xff;
    case 0:
    }
    prefix = (b1 << 16) | (b2 << 8) | (b3);
    return prefix;
}
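To see what the fall-through switch produces: for a two-byte key {0x41, 0x42}, b1 = 0x41, b2 = 0x42, b3 = 0, giving the proxy 0x414200. A standalone sketch of the same packing (helper name is hypothetical, not part of the Tez API):

public class PrefixDemo {
    // pack up to the first three bytes of a key into a sortable int prefix
    static int prefixOf(byte[] content, int len) {
        int b1 = 0, b2 = 0, b3 = 0;
        switch (len) {
        default:
        case 3: b3 = content[2] & 0xff;
        case 2: b2 = content[1] & 0xff;
        case 1: b1 = content[0] & 0xff;
        case 0:
        }
        return (b1 << 16) | (b2 << 8) | b3;
    }

    public static void main(String[] args) {
        System.out.printf("0x%06X%n", prefixOf(new byte[] { 0x41, 0x42 }, 2)); // 0x414200
    }
}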
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java
License:Apache License
static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;

    // iterate and validate stuff ...
    Text key = new Text();
    BytesWritable value = new BytesWritable();
    while (reader.next(key, value)) {
        TestRecord testRecord = records.get(itemIndex++);

        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping code
        // replaces invalid characters with ?, which would cause our test case, which deliberately uses
        // invalid characters to form the key, to break)
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(),
            0, key.getLength()) == 0);
        // returned bytes represent the header (encoded in utf-8), terminated by \r\n\r\n. The content
        // follows this terminator; we search for this specific byte pattern to locate the start of the
        // content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
            "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
            value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);
}
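The same header/content split recurs throughout the commoncrawl examples here: scan only the valid region of the value for \r\n\r\n, then treat everything after it as the payload. A minimal standalone version of that split (helper name is hypothetical, not the project's API):

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.BytesWritable;

public class ArcValueSplit {
    // returns the offset of the first content byte, or -1 if no header terminator
    static int contentOffset(BytesWritable value) {
        byte[] buf = value.getBytes();
        int len = value.getLength(); // search only the valid region, not the padding
        byte[] sep = "\r\n\r\n".getBytes(StandardCharsets.UTF_8);
        for (int i = 0; i + sep.length <= len; i++) {
            int j = 0;
            while (j < sep.length && buf[i + j] == sep[j]) j++;
            if (j == sep.length) return i + sep.length;
        }
        return -1;
    }
}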
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileRecordReaderTests.java
License:Apache License
@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {

    JobConf conf = new JobConf();
    FileSystem fs = LocalFileSystem.get(conf);
    Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));

    List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    FSDataOutputStream os = fs.create(path);
    try {
        // write the ARC File into memory
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                testAttemptTime);
        }
        os.flush();
    } finally {
        os.close();
    }

    FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);

    ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(conf, split);

    int index = 0;

    // iterate and validate stuff ...
    Text key = reader.createKey();
    BytesWritable value = reader.createValue();

    while (reader.next(key, value)) {
        TestRecord testRecord = records.get(index++);

        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping code
        // replaces invalid characters with ?, which would cause our test case, which deliberately uses
        // invalid characters to form the key, to break)
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(),
            0, key.getLength()) == 0);
        // returned bytes represent the header (encoded in utf-8), terminated by \r\n\r\n. The content
        // follows this terminator; we search for this specific byte pattern to locate the start of the
        // content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
            "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
            value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    fs.delete(path, false);
}
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java
License:Apache License
static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;

    // iterate and validate stuff ...
    while (reader.nextKeyValue()) {
        Text key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();

        TestRecord testRecord = records.get(itemIndex++);

        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping code
        // replaces invalid characters with ?, which would cause our test case, which deliberately uses
        // invalid characters to form the key, to break)
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(),
            0, key.getLength()) == 0);
        // returned bytes represent the header (encoded in utf-8), terminated by \r\n\r\n. The content
        // follows this terminator; we search for this specific byte pattern to locate the start of the
        // content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
            "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
            value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);
}
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileRecordReaderTests.java
License:Apache License
@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {

    Configuration conf = new Configuration();
    FileSystem fs = LocalFileSystem.get(conf);
    Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));

    List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    FSDataOutputStream os = fs.create(path);
    try {
        // write the ARC File into memory
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                testAttemptTime);
        }
        os.flush();
    } finally {
        os.close();
    }

    FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);

    ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(split, new TaskAttemptContext(conf, new TaskAttemptID()));

    int index = 0;

    // iterate and validate stuff ...
    while (reader.nextKeyValue()) {
        Text key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();

        TestRecord testRecord = records.get(index++);

        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping code
        // replaces invalid characters with ?, which would cause our test case, which deliberately uses
        // invalid characters to form the key, to break)
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(),
            0, key.getLength()) == 0);
        // returned bytes represent the header (encoded in utf-8), terminated by \r\n\r\n. The content
        // follows this terminator; we search for this specific byte pattern to locate the start of the
        // content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
            "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
            value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    fs.delete(path, false);
}
From source file:org.commoncrawl.util.shared.ARCFileReader.java
License:Apache License
public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException {
    Configuration conf = new Configuration();
    String path = null;

    CommandLineParser parser = new GnuParser();
    try {
        // parse the command line arguments
        CommandLine cmdLine = parser.parse(options, args);

        // get ARCFile Path
        path = cmdLine.getOptionValue("file");

        // get optional config
        if (cmdLine.hasOption("conf")) {
            conf.addResource(new Path(cmdLine.getOptionValue("conf")));
        }
        if (cmdLine.hasOption("awsAccessKey")) {
            conf.set("fs.s3n.awsAccessKeyId", cmdLine.getOptionValue("awsAccessKey"));
        }
        if (cmdLine.hasOption("awsSecret")) {
            conf.set("fs.s3n.awsSecretAccessKey", cmdLine.getOptionValue("awsSecret"));
        }
    } catch (ParseException e) {
        System.out.println(e.toString());
        printUsage();
        System.exit(1);
    }

    final URI uri = new URI(path);
    FileSystem fs = FileSystem.get(uri, conf);

    // byte data[] = new byte[4096 * 10];
    // int readAmt = 0;
    // while ((readAmt = stream.get().read(data)) != -1) {
    //     System.out.println(HexDump.dumpHexString(data, 0, readAmt));
    // }
    // stream.get().close();
    // System.exit(1);

    ARCFileReader reader = null;
    try {
        System.out.println("Initializing Reader for Path:" + uri);
        reader = new ARCFileReader(fs.open(new Path(path)));
        Text key = new Text();
        BytesWritable value = new BytesWritable();

        while (reader.hasMoreItems()) {
            reader.nextKeyValue(key, value);
            int indexOfTrailingCRLF = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
            int headerLen = indexOfTrailingCRLF + 4;
            int contentLen = value.getLength() - headerLen;

            String outputStr = "Key:" + key.toString() + " HeaderLen:" + headerLen + " ContentLen:"
                + contentLen;
            System.out.println(outputStr);

            // String contentStr = new String(value.getBytes(), headerLen, contentLen, Charset.forName("ASCII"));
            // System.out.println(contentStr.substring(contentStr.length() - 20));
        }
        System.out.println("Exiting Loop");
    } catch (Exception e) {
        System.out.println(CCStringUtils.stringifyException(e));
        LOG.error(CCStringUtils.stringifyException(e));
        // throw new IOException(e);
    } finally {
        if (reader != null) {
            System.out.println("***Closing Reader");
            reader.close();
        }
    }
}
From source file:org.commoncrawl.util.shared.ArcFileReaderTests.java
License:Apache License
/**
 * Test basic reader functionality by creating a mock ARCFile in memory and
 * then reading it back and validating the contents...
 */
@Test
public void testReader() {
    DataOutputBuffer os = new DataOutputBuffer();
    long timestamp = System.currentTimeMillis();
    try {
        // write the ARC File into memory
        writeFirstRecord(os, "test", timestamp);

        List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);

        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            NIOHttpHeaders headers = new NIOHttpHeaders();
            for (int i = 0; i < record.headers.size(); ++i) {
                headers.set(record.headers.get(i).e0, record.headers.get(i).e1);
            }
            write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers, "text/html",
                MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
        }
        os.flush();
        os.close();

        final AtomicBoolean streamClosed = new AtomicBoolean();

        // set up ArcFileReader to read the file
        InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) {
            public synchronized int read(byte b[], int off, int len) {
                len = 1;
                return super.read(b, off, len);
            }

            public void close() throws IOException {
                super.close();
                streamClosed.set(true);
            }
        };

        ARCFileReader reader = new ARCFileReader(in);
        int index = 0;

        Text key = new Text();
        BytesWritable value = new BytesWritable();

        // iterate and validate stuff ...
        while (reader.hasMoreItems()) {
            reader.nextKeyValue(key, value);

            TestRecord testRecord = records.get(index++);

            // get test key bytes as utf-8 bytes ...
            byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
            // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping code
            // replaces invalid characters with ?, which would cause our test case, which deliberately uses
            // invalid characters to form the key, to break)
            Assert.assertTrue(
                compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);
            // returned bytes represent the header (encoded in utf-8), terminated by \r\n\r\n. The content
            // follows this terminator; we search for this specific byte pattern to locate the start of the
            // content, then compare it against the source ...
            int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
            if (indexofHeaderTerminator == -1) {
                throw new IOException("No Header Terminator found in Value!");
            }
            indexofHeaderTerminator += 4;

            // read headers ...
            String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator,
                Charset.forName("UTF-8"));
            NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
            for (int i = 0; i < testRecord.headers.size(); ++i) {
                Pair<String, String> testHeaderRecord = testRecord.headers.get(i);
                Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
                Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0));
            }

            Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(),
                indexofHeaderTerminator, testRecord.data.length) == 0);
        }
        reader.close();

        Assert.assertEquals(index, BASIC_TEST_RECORD_COUNT);
        Assert.assertTrue(streamClosed.get());
    } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}