Example usage for org.apache.hadoop.io Text getBytes

List of usage examples for org.apache.hadoop.io Text getBytes

Introduction

This page lists example usages of org.apache.hadoop.io.Text.getBytes() collected from open source projects.

Prototype

@Override
public byte[] getBytes() 

Document

Returns the raw bytes; however, only data up to getLength() is valid.
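
Because Text reuses its backing byte array, getBytes() can hand back an array that is longer than the encoded data, so every use has to be paired with getLength(). Below is a minimal sketch of the safe pattern, with illustrative class and variable names only:

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class TextGetBytesSketch {
    public static void main(String[] args) {
        Text text = new Text("a fairly long initial value");
        text.set(new Text("ok")); // the existing backing array is typically reused, so it stays larger than 2 bytes

        byte[] raw = text.getBytes(); // raw.length may exceed the valid data length
        int valid = text.getLength(); // only the first 'valid' bytes are meaningful

        // decode only the valid region; Text always stores UTF-8
        String decoded = new String(raw, 0, valid, StandardCharsets.UTF_8);
        System.out.println(decoded + " (" + valid + " of " + raw.length + " bytes valid)");
    }
}

The examples below follow the same rule: whenever getBytes() feeds a String, a Row.Key, or a byte-range comparison, the code passes an explicit offset and getLength() (or an index computed from it) rather than consuming the whole array.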

Usage

From source file:org.cloudata.examples.web.TermUploadJob.java

License:Apache License

public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermUploadJob <num of repeats> termUpload <inputPath> [#redcue]");
        System.exit(0);//from   w  w w  . ja v  a 2s. c o  m
    }
    JobConf jobConf = new JobConf(TermUploadJob.class);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 1) {
        maxReduce = Integer.parseInt(options[1]);
    }

    jobConf.setInt("mapred.task.timeout", 60 * 60 * 1000);

    FileSystem fs = FileSystem.get(jobConf);

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TERM_TABLE)) {
        // create the term table, pre-split by term ranges, if it does not already exist
        Path path = new Path("blogdata/tmp/weight");
        FileStatus[] paths = fs.listStatus(path);
        if (paths == null || paths.length == 0) {
            LOG.error("No Partition info:" + path);
            return;
        }
        SortedSet<Text> terms = new TreeSet<Text>();
        Text text = new Text();
        for (FileStatus eachPath : paths) {
            CloudataLineReader reader = new CloudataLineReader(fs.open(eachPath.getPath()));
            while (true) {
                int length = reader.readLine(text);
                if (length <= 0) {
                    break;
                }
                terms.add(new Text(text));
            }
        }

        int termsPerTablet = terms.size() / (maxReduce - 1);
        int count = 0;
        List<Row.Key> rowKeys = new ArrayList<Row.Key>();
        for (Text term : terms) {
            count++;
            if (count == termsPerTablet) {
                rowKeys.add(new Row.Key(term.getBytes()));
                count = 0;
            }
        }
        rowKeys.add(Row.Key.MAX_KEY);

        TableSchema termTableInfo = new TableSchema(TERM_TABLE, "Test", TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, termTableInfo, rowKeys.toArray(new Row.Key[] {}));
    }
    CTable termTable = CTable.openTable(nconf, TERM_TABLE);
    TabletInfo[] tabletInfos = termTable.listTabletInfos();

    Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis());

    jobConf.setJobName("TermUploadJob" + "(" + new Date() + ")");
    FileInputFormat.addInputPath(jobConf, new Path(options[0]));

    //<MAP>
    jobConf.setMapperClass(TermUploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, TERM_TABLE);
    jobConf.setPartitionerClass(WebKeyRangePartitioner.class);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermUploadReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setNumReduceTasks(tabletInfos.length);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce);
    jobConf.setMaxReduceAttempts(0);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    fs.delete(tempOutputPath);
}

From source file:org.cloudata.examples.web.TermUploadMap.java

License:Apache License

public void map(WritableComparable key, Writable value, OutputCollector<WritableComparable, Writable> collector,
        Reporter reporter) throws IOException {

    collector.collect((Text) value, new Text(""));
    count++;
    if (count % 50000 == 0) {
        Text tValue = (Text) value;
        String keyStr = new String(tValue.getBytes(), 0, tValue.getLength(), "EUC-KR");
        System.out.println(keyStr);
    }
}
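
The map function above applies that rule when decoding: new String(tValue.getBytes(), 0, tValue.getLength(), "EUC-KR") deliberately stops at getLength(). When a trimmed byte[] copy is needed instead (for example to build a Row.Key, as in the reduce step that follows), one way to obtain it is sketched below; recent Hadoop releases also offer Text.copyBytes() for the same purpose, if your version provides it. Names in the sketch are illustrative.

import java.util.Arrays;

import org.apache.hadoop.io.Text;

public class TrimmedTextCopy {
    // Returns exactly the valid bytes of the Text. Arrays.copyOf truncates to the
    // requested length, so stale bytes past getLength() in the backing array are dropped.
    static byte[] trimmedCopy(Text text) {
        return Arrays.copyOf(text.getBytes(), text.getLength());
    }

    public static void main(String[] args) {
        Text text = new Text("some reasonably long initial value");
        text.set(new Text("ok")); // backing array is reused and remains larger than the new data

        System.out.println(trimmedCopy(text).length); // prints 2
    }
}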

From source file:org.cloudata.examples.web.TermUploadReduce.java

License:Apache License

public void reduce(WritableComparable key, Iterator<Writable> values,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    if (exception != null) {
        throw exception;
    }
    Text tKey = (Text) key;
    int keyIndex = tKey.find("\t");
    if (keyIndex < 0) {
        LOG.error("invalid value:" + tKey);
        return;
    }

    Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, keyIndex);

    String keyStr = new String(tKey.getBytes(), keyIndex + 1, (tKey.getLength() - keyIndex - 1), "EUC-KR");

    //key layout (tf): term \t tf \t documentId \t freq \t weight
    //key layout (df): term \t df \t document-frequency
    String[] valueTokens = keyStr.split("\t");

    if (rowKey.getLength() < TestWebPage.MIN_TERM_LENGTH) {
        return;
    }

    count++;
    if (count % 50000 == 0) {
        System.out.println(new Date() + ":" + keyStr);
    }

    if (valueTokens.length == 2 && "df".equals(valueTokens[0])) {
        Row row = new Row(rowKey);
        row.addCell("df", new Cell(Cell.Key.EMPTY_KEY, valueTokens[1].getBytes()));
        dfUploader.put(row);
    } else if (valueTokens.length == 4 && "tf".equals(valueTokens[0])) {
        Row row = new Row(rowKey);
        String documentId = valueTokens[1];
        String freq = valueTokens[2];
        String weight = valueTokens[3];

        row.addCell("tf", new Cell(new Cell.Key(documentId), freq.getBytes()));
        row.addCell("weight", new Cell(new Cell.Key(documentId), weight.getBytes()));

        byte[] documentIdBytes = documentId.getBytes();

        row.addCell("i_weight",
                new Cell(new Cell.Key((df.format(1.0 - Double.parseDouble(weight)) + documentId).getBytes()),
                        documentIdBytes));

        weightUploader.put(row);
    } else {
        LOG.error("invalid value:" + valueTokens.length + "," + count + "," + valueTokens[1] + "," + keyStr);
        return;
    }
}

From source file:org.cloudata.examples.web.TermWeightReduce.java

License:Apache License

public void reduce(WritableComparable key, Iterator<Writable> values,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    if (exception != null) {
        throw exception;
    }
    //key: term, value: documentId , freq, docLength
    Text tKey = (Text) key;
    Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, tKey.getLength());
    String keyStr = new String(tKey.getBytes(), 0, tKey.getLength(), "EUC-KR");

    if (tKey.getLength() == 0 || keyStr.trim().length() < TestWebPage.MIN_TERM_LENGTH) {
        return;
    }

    List<Object[]> termFreqs = new ArrayList<Object[]>(100);
    Set<String> docs = new HashSet<String>();

    while (values.hasNext()) {
        Text tValue = (Text) values.next();
        String valueStr = tValue.toString();
        String[] valueTokens = valueStr.split("\t");
        if (valueTokens.length < 3) {
            LOG.error("valueTokens != 3:" + valueStr);
            return;
        }
        String documentId = valueTokens[0];
        int freq = Integer.parseInt(valueTokens[1]);
        long docLength = Long.parseLong(valueTokens[2]);
        docs.add(documentId);

        termFreqs.add(new Object[] { documentId, freq, docLength });
        if (termFreqs.size() > 100000) {
            LOG.info("Too many tf:term=" + keyStr);
            break;
        }
    }
    int numOfdocument = docs.size();

    for (Object[] eachValue : termFreqs) {
        String documentId = (String) eachValue[0];
        int freq = (Integer) eachValue[1];
        long docLength = (Long) eachValue[2];

        double weight = getTermWeight(freq, docLength, avgDocLength, sumDocCount, numOfdocument);

        collector.collect(tKey,
                new Text("tf\t" + documentId + "\t" + String.valueOf(freq) + "\t" + df.format(weight)));
        termCount++;
        if (termCount % 100000 == 0) {
            System.out.println("term=" + keyStr + ",document=" + documentId + ",freq=" + freq + ",df="
                    + numOfdocument + "weight=" + df.format(weight));
        }
    }
    collector.collect(tKey, new Text("df\t" + numOfdocument));

    if (termCount % 100 == 0) {
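        // note: write(byte[]) emits the entire backing array; if it has grown larger than
        // getLength(), stale trailing bytes are written as well. Writing with an explicit
        // range, partitionOut.write(tKey.getBytes(), 0, tKey.getLength()), would emit only the valid region.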
        partitionOut.write(tKey.getBytes());
        partitionOut.write("\n".getBytes());
    }
}

From source file:org.cloudata.examples.web.TermWeightReduceOnline.java

License:Apache License

public void reduce(WritableComparable key, Iterator<Writable> values,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    if (exception != null) {
        throw exception;
    }
    //key: term, value: documentId , freq, docLength
    Text tKey = (Text) key;
    Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, tKey.getLength());
    String keyStr = new String(tKey.getBytes(), 0, tKey.getLength(), "EUC-KR");

    if (tKey.getLength() == 0 || keyStr.trim().length() < TestWebPage.MIN_TERM_LENGTH) {
        return;
    }

    Row row = termTable.get(rowKey, "df");
    if (row == null || row.getColumnSize() == 0) {
        LOG.error("No df for term:" + keyStr);
        return;
    }

    int docFreq = row.getOne("df").getValue().getValueAsInt();

    Row iRow = new Row(rowKey);
    int count = 0;
    List<ColumnValue> tfColumnValues = new ArrayList<ColumnValue>();
    List<ColumnValue> weightColumnValues = new ArrayList<ColumnValue>();
    List<ColumnValue> iWeightColumnValues = new ArrayList<ColumnValue>();

    while (values.hasNext()) {
        Text tValue = (Text) values.next();
        String valueStr = tValue.toString();
        String[] valueTokens = valueStr.split("\t");
        if (valueTokens.length < 3) {
            LOG.error("valueTokens != 3:" + valueStr);
            return;
        }
        String documentId = valueTokens[0];
        int freq = Integer.parseInt(valueTokens[1]);
        long docLength = Long.parseLong(valueTokens[2]);

        double weight = getTermWeight(freq, docLength, avgDocLength, sumDocCount, docFreq);

        iRow.addCell("tf", new Cell(new Cell.Key(documentId), Integer.toString(freq).getBytes()));
        iRow.addCell("weigth", new Cell(new Cell.Key(documentId), Integer.toString(freq).getBytes()));

        byte[] documentIdBytes = documentId.getBytes();
        iRow.addCell("i_weight",
                new Cell(new Cell.Key((df.format(1.0 - weight) + documentId).getBytes()), documentIdBytes));

        if (termCount % 100000 == 0) {
            System.out.println("term=" + keyStr + ",document=" + documentId + ",freq=" + freq + ",df=" + docFreq
                    + "weight=" + df.format(weight));
        }
        termCount++;

        count++;
        if (count % 500 == 0) {
            try {
                termTable.put(iRow);
            } catch (Exception e) {
                LOG.error(e);
            }
        }
        iRow = new Row(rowKey);
    }
    try {
        termTable.put(iRow);
    } catch (Exception e) {
        LOG.error(e);
    }
}

From source file:org.cloudata.examples.web.WebKeyRangePartitioner.java

License:Apache License

public int getPartition(WritableComparable key, Writable value, int numPartitions) {
    if (confException != null) {
        LOG.error(confException.getMessage(), confException);
        return -1;
    }

    if (numPartitions != tabletInfoSet.size()) {
        LOG.error(
                "tablet count(" + tabletInfoSet.size() + ") not equals numPartitions (" + numPartitions + ")");
        return -1;
    }

    if (tabletInfoSet.size() == 0) {
        LOG.error("tablet partition size is zero");
        return -1;
    }
    int partitionNumber = 0;
    Text tKey = (Text) key;

    Row.Key rowKey;

    int keyIndex = tKey.find("\t");
    if (keyIndex < 0) {
        LOG.error("invalid value:" + tKey);
        rowKey = Row.Key.MAX_KEY;
    } else {
        rowKey = new Row.Key(tKey.getBytes(), 0, keyIndex);
    }

    SortedSet<RowKeyItem> tailSet = tabletInfoSet.tailSet(new RowKeyItem(rowKey, 0));
    RowKeyItem item = null;
    if (tailSet.size() > 0) {
        item = tailSet.first();
        partitionNumber = item.index;
    } else {
        item = tabletInfoSet.last();
        partitionNumber = item.index;
    }

    if (partitionNumber >= numPartitions) {
        LOG.info("Partition Number is : " + partitionNumber + ", numPartitions : " + numPartitions
                + ", Row.Key : " + key.toString());
        partitionNumber = numPartitions - 1;
    }
    //LOG.info("tablet partition num:" + partitionNumber);
    count++;
    if (count % 5000 == 0) {
        try {
            System.out.println("Partitioned:" + new String(rowKey.getBytes(), "EUC-KR") + ","
                    + new String(item.rowKey.getBytes(), "EUC-KR"));
        } catch (UnsupportedEncodingException e) {

        }
    }
    return partitionNumber;
}

From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java

License:Apache License

static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;
    // iterate and validate stuff ...
    Text key = new Text();
    BytesWritable value = new BytesWritable();
    while (reader.next(key, value)) {

        TestRecord testRecord = records.get(itemIndex++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against the raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces
        // invalid characters with '?', which would break this test case, since it deliberately uses invalid
        // characters to form the key)
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // the returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows
        // this terminator, so we search for that byte pattern to locate the start of the content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);

}

From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java

License:Apache License

static void validateArcFileItemSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, ArcFileItem> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;
    // iterate and validate stuff ...
    Text key = new Text();
    ArcFileItem value = new ArcFileItem();
    while (reader.next(key, value)) {

        TestRecord testRecord = records.get(itemIndex++);

        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against the raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces
        // invalid characters with '?', which would break this test case, since it deliberately uses invalid
        // characters to form the key)
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // the returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows
        // this terminator, so we search for that byte pattern to locate the start of the content, then compare it against the source ...
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getContent().getReadOnlyBytes(), value.getContent().getOffset(),
                value.getContent().getCount()) == 0);
        NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems());
        // validate metadata 
        Assert.assertEquals("text/html", headers.findValue(Constants.ARCFileHeader_ARC_MimeType));
        Assert.assertEquals(value.getArcFilePos(), testRecord.streamPos);
        Assert.assertEquals(value.getArcFileSize(), testRecord.rawSize);
        Assert.assertEquals("test-value", headers.findValue("test"));
        Assert.assertEquals(value.getArcFileName(), ((FileSplit) split).getPath().getName());

    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);

}

From source file:org.commoncrawl.hadoop.io.mapred.ArcFileRecordReaderTests.java

License:Apache License

@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {

    JobConf conf = new JobConf();
    FileSystem fs = LocalFileSystem.get(conf);
    Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));
    List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    FSDataOutputStream os = fs.create(path);
    try {
        // write the ARC File into memory 
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                    testAttemptTime);
        }
        os.flush();
    } finally {
        os.close();
    }

    FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);
    ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(conf, split);

    int index = 0;

    // iterate and validate stuff ... 
    Text key = reader.createKey();
    BytesWritable value = reader.createValue();

    while (reader.next(key, value)) {

        TestRecord testRecord = records.get(index++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against the raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces
        // invalid characters with '?', which would break this test case, since it deliberately uses invalid
        // characters to form the key)
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // the returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows
        // this terminator, so we search for that byte pattern to locate the start of the content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    fs.delete(path, false);
}

From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java

License:Apache License

static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;
    // iterate and validate stuff ... 
    while (reader.nextKeyValue()) {
        Text key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();

        TestRecord testRecord = records.get(itemIndex++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against the raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces
        // invalid characters with '?', which would break this test case, since it deliberately uses invalid
        // characters to form the key)
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // the returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows
        // this terminator, so we search for that byte pattern to locate the start of the content, then compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);

}