Example usage for org.apache.hadoop.io Text getLength

List of usage examples for org.apache.hadoop.io Text getLength

Introduction

On this page you can find example usage for org.apache.hadoop.io Text getLength.

Prototype

@Override
public int getLength() 

Document

Returns the number of bytes in the byte array.
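
The snippet below is a minimal standalone sketch (the class name TextGetLengthDemo is illustrative, not taken from the examples that follow) showing why getLength() matters: getBytes() returns the backing byte array, which can be longer than the valid data, so only the first getLength() bytes should be decoded.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class TextGetLengthDemo {
    public static void main(String[] args) {
        Text text = new Text("hello");
        // Reusing the same Text instance keeps the existing backing array when it is large enough.
        text.set(new Text("hi"));

        byte[] backing = text.getBytes(); // backing array, may still hold stale bytes beyond the valid data
        int valid = text.getLength();     // number of valid bytes, here 2

        // Correct: decode only the valid prefix of the backing array.
        String ok = new String(backing, 0, valid, StandardCharsets.UTF_8);

        // Risky: decoding the whole backing array may include stale bytes left over from "hello"
        // (the exact leftover content depends on Text's internal buffer management).
        String raw = new String(backing, StandardCharsets.UTF_8);

        System.out.println(ok + " (" + valid + " valid bytes), whole buffer: " + raw);
    }
}

This is the same pattern the examples below rely on, for instance new String(tKey.getBytes(), 0, tKey.getLength(), "EUC-KR").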

Usage

From source file:org.cloudata.examples.web.TermUploadMap.java

License:Apache License

public void map(WritableComparable key, Writable value, OutputCollector<WritableComparable, Writable> collector,
        Reporter reporter) throws IOException {

    collector.collect((Text) value, new Text(""));
    count++;
    if (count % 50000 == 0) {
        Text tValue = (Text) value;
        String keyStr = new String(tValue.getBytes(), 0, tValue.getLength(), "EUC-KR");
        System.out.println(keyStr);
    }
}

From source file:org.cloudata.examples.web.TermUploadReduce.java

License:Apache License

public void reduce(WritableComparable key, Iterator<Writable> values,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    if (exception != null) {
        throw exception;
    }
    Text tKey = (Text) key;
    int keyIndex = tKey.find("\t");
    if (keyIndex < 0) {
        LOG.error("invalid value:" + tKey);
        return;
    }

    Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, keyIndex);

    String keyStr = new String(tKey.getBytes(), keyIndex + 1, (tKey.getLength() - keyIndex - 1), "EUC-KR");

    //term, ?(tf), documentId url, freq, weight
    //term, ?(df), df
    String[] valueTokens = keyStr.split("\t");

    if (rowKey.getLength() < TestWebPage.MIN_TERM_LENGTH) {
        return;
    }

    count++;
    if (count % 50000 == 0) {
        System.out.println(new Date() + ":" + keyStr);
    }

    if (valueTokens.length == 2 && "df".equals(valueTokens[0])) {
        Row row = new Row(rowKey);
        row.addCell("df", new Cell(Cell.Key.EMPTY_KEY, valueTokens[1].getBytes()));
        dfUploader.put(row);
    } else if (valueTokens.length == 4 && "tf".equals(valueTokens[0])) {
        Row row = new Row(rowKey);
        String documentId = valueTokens[1];
        String freq = valueTokens[2];
        String weight = valueTokens[3];

        row.addCell("tf", new Cell(new Cell.Key(documentId), freq.getBytes()));
        row.addCell("weight", new Cell(new Cell.Key(documentId), weight.getBytes()));

        byte[] documentIdBytes = documentId.getBytes();

        row.addCell("i_weight",
                new Cell(new Cell.Key((df.format(1.0 - Double.parseDouble(weight)) + documentId).getBytes()),
                        documentIdBytes));

        weightUploader.put(row);
    } else {
        LOG.error("invalid value:" + valueTokens.length + "," + count + "," + valueTokens[1] + "," + keyStr);
        return;
    }
}

From source file:org.cloudata.examples.web.TermWeightReduce.java

License:Apache License

public void reduce(WritableComparable key, Iterator<Writable> values,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    if (exception != null) {
        throw exception;
    }
    //key: term, value: documentId , freq, docLength
    Text tKey = (Text) key;
    Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, tKey.getLength());
    String keyStr = new String(tKey.getBytes(), 0, tKey.getLength(), "EUC-KR");

    if (tKey.getLength() == 0 || keyStr.trim().length() < TestWebPage.MIN_TERM_LENGTH) {
        return;
    }

    List<Object[]> termFreqs = new ArrayList<Object[]>(100);
    Set<String> docs = new HashSet<String>();

    while (values.hasNext()) {
        Text tValue = (Text) values.next();
        String valueStr = tValue.toString();
        String[] valueTokens = valueStr.split("\t");
        if (valueTokens.length < 3) {
            LOG.error("valueTokens != 3:" + valueStr);
            return;
        }
        String documentId = valueTokens[0];
        int freq = Integer.parseInt(valueTokens[1]);
        long docLength = Long.parseLong(valueTokens[2]);
        docs.add(documentId);

        termFreqs.add(new Object[] { documentId, freq, docLength });
        if (termFreqs.size() > 100000) {
            LOG.info("Too many tf:term=" + keyStr);
            break;
        }
    }
    int numOfdocument = docs.size();

    for (Object[] eachValue : termFreqs) {
        String documentId = (String) eachValue[0];
        int freq = (Integer) eachValue[1];
        long docLength = (Long) eachValue[2];

        double weight = getTermWeight(freq, docLength, avgDocLength, sumDocCount, numOfdocument);

        collector.collect(tKey,
                new Text("tf\t" + documentId + "\t" + String.valueOf(freq) + "\t" + df.format(weight)));
        termCount++;
        if (termCount % 100000 == 0) {
            System.out.println("term=" + keyStr + ",document=" + documentId + ",freq=" + freq + ",df="
                    + numOfdocument + ",weight=" + df.format(weight));
        }
    }
    collector.collect(tKey, new Text("df\t" + numOfdocument));

    if (termCount % 100 == 0) {
        partitionOut.write(tKey.getBytes());
        partitionOut.write("\n".getBytes());
    }
}

From source file:org.cloudata.examples.web.TermWeightReduceOnline.java

License:Apache License

public void reduce(WritableComparable key, Iterator<Writable> values,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    if (exception != null) {
        throw exception;
    }
    //key: term, value: documentId , freq, docLength
    Text tKey = (Text) key;
    Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, tKey.getLength());
    String keyStr = new String(tKey.getBytes(), 0, tKey.getLength(), "EUC-KR");

    if (tKey.getLength() == 0 || keyStr.trim().length() < TestWebPage.MIN_TERM_LENGTH) {
        return;
    }

    Row row = termTable.get(rowKey, "df");
    if (row == null || row.getColumnSize() == 0) {
        LOG.error("No df for term:" + keyStr);
        return;
    }

    int docFreq = row.getOne("df").getValue().getValueAsInt();

    Row iRow = new Row(rowKey);
    int count = 0;
    List<ColumnValue> tfColumnValues = new ArrayList<ColumnValue>();
    List<ColumnValue> weightColumnValues = new ArrayList<ColumnValue>();
    List<ColumnValue> iWeightColumnValues = new ArrayList<ColumnValue>();

    while (values.hasNext()) {
        Text tValue = (Text) values.next();
        String valueStr = tValue.toString();
        String[] valueTokens = valueStr.split("\t");
        if (valueTokens.length < 3) {
            LOG.error("valueTokens != 3:" + valueStr);
            return;
        }
        String documentId = valueTokens[0];
        int freq = Integer.parseInt(valueTokens[1]);
        long docLength = Long.parseLong(valueTokens[2]);

        double weight = getTermWeight(freq, docLength, avgDocLength, sumDocCount, docFreq);

        iRow.addCell("tf", new Cell(new Cell.Key(documentId), Integer.toString(freq).getBytes()));
        iRow.addCell("weigth", new Cell(new Cell.Key(documentId), Integer.toString(freq).getBytes()));

        byte[] documentIdBytes = documentId.getBytes();
        iRow.addCell("i_weight",
                new Cell(new Cell.Key((df.format(1.0 - weight) + documentId).getBytes()), documentIdBytes));

        if (termCount % 100000 == 0) {
            System.out.println("term=" + keyStr + ",document=" + documentId + ",freq=" + freq + ",df=" + docFreq
                    + "weight=" + df.format(weight));
        }
        termCount++;

        count++;
        if (count % 500 == 0) {
            try {
                termTable.put(iRow);
            } catch (Exception e) {
                LOG.error(e);
            }
        }
        iRow = new Row(rowKey);
    }
    try {
        termTable.put(iRow);
    } catch (Exception e) {
        LOG.error(e);
    }
}

From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java

License:Apache License

static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;
    // iterate and validate stuff ...
    Text key = new Text();
    BytesWritable value = new BytesWritable();
    while (reader.next(key, value)) {

        TestRecord testRecord = records.get(itemIndex++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces invalid
        // characters with '?', which would break our test case, since it deliberately uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows this terminator.
        // we search for this specific byte pattern to locate the start of the content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);

}
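
The raw-byte comparison above (via key.getBytes() and key.getLength()) is used instead of key.toString() because Text decodes with a replacing decoder, so invalid UTF-8 byte sequences do not survive a round trip through a String. A minimal standalone sketch of that behavior (the class name InvalidUtf8KeyDemo is illustrative, not part of the tests):

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class InvalidUtf8KeyDemo {
    public static void main(String[] args) {
        // 0xFF is never valid in UTF-8, similar to the deliberately invalid key bytes used by the tests.
        byte[] rawKey = new byte[] { 'k', 'e', 'y', (byte) 0xFF };

        Text key = new Text();
        key.set(rawKey, 0, rawKey.length);

        // The raw bytes are preserved: getBytes()/getLength() expose them unchanged.
        System.out.println("raw length = " + key.getLength()); // 4

        // toString() substitutes a replacement character for the invalid byte, so the
        // re-encoded form no longer matches the original bytes (likely 6 bytes, since
        // U+FFFD re-encodes to three bytes).
        byte[] reencoded = key.toString().getBytes(StandardCharsets.UTF_8);
        System.out.println("re-encoded length = " + reencoded.length);
    }
}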

From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java

License:Apache License

static void validateArcFileItemSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, ArcFileItem> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;
    // iterate and validate stuff ...
    Text key = new Text();
    ArcFileItem value = new ArcFileItem();
    while (reader.next(key, value)) {

        TestRecord testRecord = records.get(itemIndex++);

        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces invalid
        // characters with '?', which would break our test case, since it deliberately uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows this terminator.
        // here the parsed ArcFileItem already exposes the content separately, so we compare it directly against the source ...
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getContent().getReadOnlyBytes(), value.getContent().getOffset(),
                value.getContent().getCount()) == 0);
        NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems());
        // validate metadata 
        Assert.assertEquals("text/html", headers.findValue(Constants.ARCFileHeader_ARC_MimeType));
        Assert.assertEquals(value.getArcFilePos(), testRecord.streamPos);
        Assert.assertEquals(value.getArcFileSize(), testRecord.rawSize);
        Assert.assertEquals("test-value", headers.findValue("test"));
        Assert.assertEquals(value.getArcFileName(), ((FileSplit) split).getPath().getName());

    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);

}

From source file:org.commoncrawl.hadoop.io.mapred.ArcFileRecordReaderTests.java

License:Apache License

@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {

    JobConf conf = new JobConf();
    FileSystem fs = LocalFileSystem.get(conf);
    Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));
    List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    FSDataOutputStream os = fs.create(path);
    try {
        // write the ARC File into memory 
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                    testAttemptTime);
        }
        os.flush();
    } finally {
        os.close();
    }

    FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);
    ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(conf, split);

    int index = 0;

    // iterate and validate stuff ... 
    Text key = reader.createKey();
    BytesWritable value = reader.createValue();

    while (reader.next(key, value)) {

        TestRecord testRecord = records.get(index++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces invalid
        // characters with '?', which would break our test case, since it deliberately uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows this terminator.
        // we search for this specific byte pattern to locate the start of the content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    fs.delete(path, false);
}

From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java

License:Apache License

static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;
    // iterate and validate stuff ... 
    while (reader.nextKeyValue()) {
        Text key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();

        TestRecord testRecord = records.get(itemIndex++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces invalid
        // characters with '?', which would break our test case, since it deliberately uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows this terminator.
        // we search for this specific byte pattern to locate the start of the content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);

}

From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java

License:Apache License

static void validateArcFileItemSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, ArcFileItem> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;
    // iterate and validate stuff ...
    while (reader.nextKeyValue()) {

        Text key = reader.getCurrentKey();
        ArcFileItem value = reader.getCurrentValue();

        TestRecord testRecord = records.get(itemIndex++);

        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces invalid
        // characters with '?', which would break our test case, since it deliberately uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows this terminator.
        // here the parsed ArcFileItem already exposes the content separately, so we compare it directly against the source ...
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getContent().getReadOnlyBytes(), value.getContent().getOffset(),
                value.getContent().getCount()) == 0);
        NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems());
        // validate metadata 
        Assert.assertEquals("text/html", headers.findValue(Constants.ARCFileHeader_ARC_MimeType));
        Assert.assertEquals(value.getArcFilePos(), testRecord.streamPos);
        Assert.assertEquals(value.getArcFileSize(), testRecord.rawSize);
        Assert.assertEquals("test-value", headers.findValue("test"));
        Assert.assertEquals(value.getArcFileName(), ((FileSplit) split).getPath().getName());

    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);

}

From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileRecordReaderTests.java

License:Apache License

@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {

    Configuration conf = new Configuration();
    FileSystem fs = LocalFileSystem.get(conf);
    Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));
    List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    FSDataOutputStream os = fs.create(path);
    try {
        // write the ARC File into memory 
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                    testAttemptTime);
        }
        os.flush();
    } finally {
        os.close();
    }

    FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);
    ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(split, new TaskAttemptContext(conf, new TaskAttemptID()));

    int index = 0;

    // iterate and validate stuff ... 
    while (reader.nextKeyValue()) {
        Text key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();

        TestRecord testRecord = records.get(index++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces invalid
        // characters with '?', which would break our test case, since it deliberately uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows this terminator.
        // we search for this specific byte pattern to locate the start of the content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    fs.delete(path, false);
}