Example usage for org.apache.hadoop.io Text getBytes

List of usage examples for org.apache.hadoop.io Text getBytes

Introduction

On this page you can find example usages of the getBytes method of org.apache.hadoop.io.Text.

Prototype

@Override
public byte[] getBytes() 

Source Link

Document

Returns the raw bytes; however, only the data up to getLength() is valid.

Usage

From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java

License:Apache License

/**
 * Reads every record of the given split through {@code reader} and validates it against the
 * expected test records registered for that split in {@code splits}. When all checks pass, the
 * split's entry is removed from {@code splits}.
 *
 * @param fs     file system handle (not used by this method)
 * @param split  the split being validated; cast to {@link FileSplit} to check the ARC file name
 * @param splits expected (path, records) pairs; the entry matching {@code split} is removed
 *               once validation succeeds
 * @param reader record reader supplying the split's key/value pairs; it is always closed before
 *               this method returns, even when an assertion fails
 * @throws IOException          if the reader fails
 * @throws InterruptedException if the reader is interrupted
 */
static void validateArcFileItemSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, ArcFileItem> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;
    final Charset utf8 = Charset.forName("UTF-8");

    int itemIndex = 0;
    try {
        // iterate and validate stuff ...
        while (reader.nextKeyValue()) {

            Text key = reader.getCurrentKey();
            ArcFileItem value = reader.getCurrentValue();

            // fail with a readable message instead of an IndexOutOfBoundsException
            Assert.assertTrue("reader returned more items than expected", itemIndex < records.size());
            TestRecord testRecord = records.get(itemIndex++);

            // get test key bytes as utf-8 bytes ...
            byte[] testKeyBytes = testRecord.url.getBytes(utf8);
            // compare against the raw key bytes to validate the key is the same (Text's utf-8 mapping code
            // replaces invalid characters with ?, which causes our test case, which does use invalid
            // characters to form the key, to break). Only the first key.getLength() bytes are valid.
            Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                    key.getLength()) == 0);
            // compare the item's content range against the source data ...
            Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                    value.getContent().getReadOnlyBytes(), value.getContent().getOffset(),
                    value.getContent().getCount()) == 0);
            NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems());
            // validate metadata (expected value first, actual value second)
            Assert.assertEquals("text/html", headers.findValue(Constants.ARCFileHeader_ARC_MimeType));
            Assert.assertEquals(testRecord.streamPos, value.getArcFilePos());
            Assert.assertEquals(testRecord.rawSize, value.getArcFileSize());
            Assert.assertEquals("test-value", headers.findValue("test"));
            Assert.assertEquals(((FileSplit) split).getPath().getName(), value.getArcFileName());
        }
    } finally {
        // release the reader even when one of the assertions above fails
        reader.close();
    }

    Assert.assertEquals(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT, itemIndex);

    splits.remove(splitDataIndex);

}

From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileRecordReaderTests.java

License:Apache License

/**
 * Writes a mock ARC file to a temporary file, reads it back through {@link ARCFileRecordReader}
 * as a single {@link FileSplit}, and validates each key and the content that follows the
 * header terminator against the generated test records.
 *
 * @throws IOException          if writing or reading the ARC file fails
 * @throws InterruptedException if the record reader is interrupted
 */
@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {

    Configuration conf = new Configuration();
    FileSystem fs = LocalFileSystem.get(conf);
    // Use the temp file that was actually created. Prefixing its absolute path with "/tmp/"
    // would point at a different location and leave the created temp file behind.
    File tempFile = File.createTempFile("ARCRecordReader", "test");
    Path path = new Path(tempFile.getAbsolutePath());

    final Charset utf8 = Charset.forName("UTF-8");
    final byte[] headerTerminator = "\r\n\r\n".getBytes(utf8);

    try {
        List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

        FSDataOutputStream os = fs.create(path);
        try {
            // write the ARC File
            ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

            long testAttemptTime = System.currentTimeMillis();

            for (TestRecord record : records) {
                ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                        new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                        testAttemptTime);
            }
            os.flush();
        } finally {
            os.close();
        }

        FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);
        ARCFileRecordReader reader = new ARCFileRecordReader();
        reader.initialize(split, new TaskAttemptContext(conf, new TaskAttemptID()));

        int index = 0;
        try {
            // iterate and validate stuff ...
            while (reader.nextKeyValue()) {
                Text key = reader.getCurrentKey();
                BytesWritable value = reader.getCurrentValue();

                TestRecord testRecord = records.get(index++);
                // get test key bytes as utf-8 bytes ...
                byte[] testKeyBytes = testRecord.url.getBytes(utf8);
                // compare against the raw key bytes to validate the key is the same (Text's utf-8 mapping
                // code replaces invalid characters with ?, which causes our test case, which does use
                // invalid characters to form the key, to break).
                Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length,
                        key.getBytes(), 0, key.getLength()) == 0);
                // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The
                // content follows this terminator; we search for this specific byte pattern to locate the
                // start of the content, then compare it against the source ...
                int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                        headerTerminator);
                // same guard as ArcFileReaderTests.testReader: without it a missing terminator would
                // silently turn into a comparison starting at offset 3
                if (indexofHeaderTerminator == -1) {
                    throw new IOException("No Header Terminator found in Value!");
                }
                indexofHeaderTerminator += 4;
                Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                        value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
            }
        } finally {
            // release the reader even when an assertion fails
            reader.close();
        }

        Assert.assertEquals(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT, index);
    } finally {
        // always remove the temporary ARC file
        fs.delete(path, false);
    }
}

From source file:org.commoncrawl.util.JoinValue.java

License:Open Source License

public JoinValue(TextBytes tag, Text value) {
    _tag = tag;/*from ww  w .  j a  v  a2 s . c  om*/
    _type = TEXT_TYPE_JOIN_VALUE;
    _textValue = new TextBytes();
    _textValue.set(value.getBytes(), 0, value.getLength());
}

From source file:org.commoncrawl.util.shared.ArcFileReaderTests.java

License:Apache License

/**
 * Tests basic reader functionality by creating a mock ARC file in memory, reading it back
 * through {@code ARCFileReader}, and validating keys, headers and content against the
 * generated test records. Also verifies that closing the reader closes the underlying stream.
 */
@Test
public void testReader() {
    DataOutputBuffer os = new DataOutputBuffer();
    long timestamp = System.currentTimeMillis();
    final Charset utf8 = Charset.forName("UTF-8");
    final byte[] headerTerminator = "\r\n\r\n".getBytes(utf8);
    try {
        // write the ARC File into memory
        writeFirstRecord(os, "test", timestamp);
        List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);
        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            NIOHttpHeaders headers = new NIOHttpHeaders();
            for (int i = 0; i < record.headers.size(); ++i) {
                headers.set(record.headers.get(i).e0, record.headers.get(i).e1);
            }

            write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers, "text/html",
                    MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
        }
        os.flush();
        os.close();

        final AtomicBoolean streamClosed = new AtomicBoolean();
        // setup ArcFileReader to read the file
        InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) {

            @Override
            public synchronized int read(byte b[], int off, int len) {
                // hand out at most one byte per call
                // NOTE(review): presumably to exercise the reader's handling of short reads - confirm
                len = 1;
                return super.read(b, off, len);
            }

            @Override
            public void close() throws IOException {
                super.close();
                // record that the reader propagated close() to the stream
                streamClosed.set(true);
            }
        };
        ARCFileReader reader = new ARCFileReader(in);
        int index = 0;
        Text key = new Text();
        BytesWritable value = new BytesWritable();

        try {
            // iterate and validate stuff ...
            while (reader.hasMoreItems()) {
                reader.nextKeyValue(key, value);
                TestRecord testRecord = records.get(index++);
                // get test key bytes as utf-8 bytes ...
                byte[] testKeyBytes = testRecord.url.getBytes(utf8);
                // compare against the raw key bytes to validate the key is the same (Text's utf-8 mapping
                // code replaces invalid characters with ?, which causes our test case, which does use
                // invalid characters to form the key, to break).
                Assert.assertTrue(
                        compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);
                // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The
                // content follows this terminator; we search for this specific byte pattern to locate the
                // start of the content, then compare it against the source ...
                int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                        headerTerminator);
                if (indexofHeaderTerminator == -1) {
                    throw new IOException("No Header Terminator found in Value!");
                }
                indexofHeaderTerminator += 4;
                // read headers ...
                String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator, utf8);
                NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
                for (int i = 0; i < testRecord.headers.size(); ++i) {
                    Pair<String, String> testHeaderRecord = testRecord.headers.get(i);
                    Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
                    Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0));
                }

                Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(),
                        indexofHeaderTerminator, testRecord.data.length) == 0);
            }
        } finally {
            // close the reader even when an assertion above fails
            reader.close();
        }

        Assert.assertEquals(BASIC_TEST_RECORD_COUNT, index);
        Assert.assertTrue(streamClosed.get());
    } catch (IOException e) {
        // the cause is preserved in the rethrown exception, so no separate stack trace dump
        throw new RuntimeException(e);
    }
}

From source file:org.commoncrawl.util.TextBytes.java

License:Open Source License

/**
 * Sets this instance from another {@link Text} by delegating to
 * {@code set(byte[], int, int)} with the range {@code [0, other.getLength())}.
 *
 * @param other the text to take the bytes from
 */
public void set(Text other) {
    // getBytes() exposes the raw buffer; only the first getLength() bytes are valid
    final byte[] rawBytes = other.getBytes();
    final int validLength = other.getLength();
    set(rawBytes, 0, validLength);
}

From source file:org.commoncrawl.util.URLUtils.java

License:Open Source License

/**
 * Extracts the host name from a URL held in a {@link Text} key.
 *
 * @param key the URL key; only its first {@code key.getLength()} bytes are examined
 * @return the host name, or {@code null} when {@code fastGetHostFromTextURL} returns
 *         {@code null} or a zero-length result
 */
public static String getHostNameFromURLKey(Text key) {

    final byte[] keyBytes = key.getBytes();
    fastGetResult result = fastGetHostFromTextURL(keyBytes, 0, key.getLength());

    if (result != null && result.length != 0) {
        // Text stores UTF-8, so decode explicitly as UTF-8 rather than with the platform
        // default charset, which would make the result machine-dependent for non-ASCII hosts.
        return new String(keyBytes, result.offset, result.length,
                java.nio.charset.Charset.forName("UTF-8"));
    }
    return null;
}

From source file:org.culturegraph.mf.cluster.job.merge.ResultMapper.java

License:Apache License

/**
 * Writes a redirect from every non-representative member of a union to the union's
 * representative (the first element polled from {@code memberSet}) and counts the redirects.
 * Records tagged {@code Union.OPEN} are skipped.
 *
 * @param tag     the union tag of the incoming record
 * @param members the members of the union
 * @param context the mapper context, used for the "redirects written" counter
 * @throws IOException          if writing to the table fails
 * @throws InterruptedException declared by the mapper contract
 */
@Override
public void map(final Text tag, final TextArrayWritable members, final Context context)
        throws IOException, InterruptedException {

    if (tag.equals(Union.OPEN)) {
        return;
    }

    memberSet.clear();
    members.copyTo(memberSet);
    final Text representative = memberSet.pollFirst();

    // Text.getBytes() returns the raw backing buffer, of which only the first getLength()
    // bytes are valid. Trim to the valid length so that no stale trailing bytes end up in
    // row keys or redirect values. pollFirst() yields null for an empty set; in that case
    // the loop below does not run and the target is never used.
    final byte[] redirectTarget = representative == null ? null
            : java.util.Arrays.copyOf(representative.getBytes(), representative.getLength());

    for (Text member : memberSet) {
        final Put put = new Put(java.util.Arrays.copyOf(member.getBytes(), member.getLength()));
        put.add(Column.Family.PROPERTY, REDIRECT, redirectTarget);
        htable.put(put);
    }
    context.getCounter(Union.UNION_FIND, "redirects written").increment(memberSet.size());
}

From source file:org.elasticsearch.hadoop.mr.MapReduceWriter.java

License:Apache License

/**
 * Serializes a Hadoop {@link Writable} through the given generator, dispatching on the
 * writable's runtime type. Arrays and maps are written recursively.
 *
 * @param writable  the value to serialize; {@code null} and {@link NullWritable} are written as null
 * @param generator the generator receiving the output
 * @return {@code true} if the value (and every nested value) was written; {@code false} if an
 *         unsupported type was met and not handled by {@code handleUnknown}
 */
@SuppressWarnings("unchecked")
public boolean write(Writable writable, Generator generator) {
    if (writable == null || writable instanceof NullWritable) {
        generator.writeNull();
        return true;
    }

    // textual types: pass the raw buffer together with its valid length
    if (writable instanceof Text) {
        Text text = (Text) writable;
        generator.writeUTF8String(text.getBytes(), 0, text.getLength());
        return true;
    }
    if (writable instanceof UTF8) {
        UTF8 utf8 = (UTF8) writable;
        generator.writeUTF8String(utf8.getBytes(), 0, utf8.getLength());
        return true;
    }

    // numeric and boolean types
    if (writable instanceof IntWritable) {
        generator.writeNumber(((IntWritable) writable).get());
        return true;
    }
    if (writable instanceof LongWritable) {
        generator.writeNumber(((LongWritable) writable).get());
        return true;
    }
    if (writable instanceof VLongWritable) {
        generator.writeNumber(((VLongWritable) writable).get());
        return true;
    }
    if (writable instanceof VIntWritable) {
        generator.writeNumber(((VIntWritable) writable).get());
        return true;
    }
    if (writable instanceof ByteWritable) {
        generator.writeNumber(((ByteWritable) writable).get());
        return true;
    }
    if (writable instanceof DoubleWritable) {
        generator.writeNumber(((DoubleWritable) writable).get());
        return true;
    }
    if (writable instanceof FloatWritable) {
        generator.writeNumber(((FloatWritable) writable).get());
        return true;
    }
    if (writable instanceof BooleanWritable) {
        generator.writeBoolean(((BooleanWritable) writable).get());
        return true;
    }

    // binary and hash types
    if (writable instanceof BytesWritable) {
        BytesWritable bytes = (BytesWritable) writable;
        generator.writeBinary(bytes.getBytes(), 0, bytes.getLength());
        return true;
    }
    if (writable instanceof MD5Hash) {
        generator.writeString(writable.toString());
        return true;
    }

    // containers: recurse, bailing out on the first element that cannot be written
    if (writable instanceof ArrayWritable) {
        generator.writeBeginArray();
        for (Writable element : ((ArrayWritable) writable).get()) {
            if (!write(element, generator)) {
                return false;
            }
        }
        generator.writeEndArray();
        return true;
    }
    if (writable instanceof AbstractMapWritable) {
        Map<Writable, Writable> entries = (Map<Writable, Writable>) writable;

        generator.writeBeginObject();
        // ignore handling sets (which are just maps with null values)
        for (Entry<Writable, Writable> entry : entries.entrySet()) {
            generator.writeFieldName(entry.getKey().toString());
            if (!write(entry.getValue(), generator)) {
                return false;
            }
        }
        generator.writeEndObject();
        return true;
    }

    // anything else is delegated only when unknown types are enabled
    if (writeUnknownTypes) {
        return handleUnknown(writable, generator);
    }
    return false;
}

From source file:org.elasticsearch.hadoop.mr.SafeWritableConverter.java

License:Apache License

/**
 * Points {@code to} at the raw bytes of {@code from} when it is a {@link Text} or a
 * {@link BytesWritable}; any other input leaves {@code to} untouched.
 *
 * @param from the source object
 * @param to   the byte holder receiving the buffer and its valid length
 */
public void invoke(Object from, BytesArray to) {
    // handle common cases; the two types are unrelated classes, so at most one branch applies
    if (from instanceof Text) {
        final Text text = (Text) from;
        to.bytes(text.getBytes(), text.getLength());
    } else if (from instanceof BytesWritable) {
        final BytesWritable binary = (BytesWritable) from;
        to.bytes(binary.getBytes(), binary.getLength());
    }
}

From source file:org.elasticsearch.hadoop.mr.WritableBytesConverter.java

License:Apache License

/**
 * Converts {@code from} into {@code to}, passing the raw buffer and valid length directly for
 * {@link Text} and {@link BytesWritable}, and deferring to the superclass for everything else.
 *
 * @param from the source object
 * @param to   the byte holder receiving the result
 */
@Override
public void convert(Object from, BytesArray to) {
    // handle common cases
    if (from instanceof Text) {
        final Text text = (Text) from;
        to.bytes(text.getBytes(), text.getLength());
    } else if (from instanceof BytesWritable) {
        final BytesWritable binary = (BytesWritable) from;
        to.bytes(binary.getBytes(), binary.getLength());
    } else {
        super.convert(from, to);
    }
}