Example usage for org.apache.hadoop.io Text getBytes

List of usage examples for org.apache.hadoop.io Text getBytes

Introduction

On this page you can find example usage for org.apache.hadoop.io Text getBytes.

Prototype

@Override
public byte[] getBytes() 

Document

Returns the raw bytes; however, only data up to #getLength() is valid.
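
Because getBytes() returns the raw backing buffer, which may be longer than the logical content, callers should always pair it with getLength(), as the examples below do. The following minimal, self-contained sketch (the class name TextGetBytesDemo is assumed; it is not taken from any source file listed below) shows how to copy only the valid portion of a Text into a fresh byte array:

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.hadoop.io.Text;

public class TextGetBytesDemo {
    public static void main(String[] args) {
        Text text = new Text("hello world");
        text.set("hi"); // the backing buffer keeps its old, larger capacity

        byte[] raw = text.getBytes(); // raw buffer; may contain stale bytes past getLength()
        int len = text.getLength();   // number of valid bytes

        // Copy only the valid prefix; using raw.length here could include stale data.
        byte[] valid = Arrays.copyOf(raw, len);
        System.out.println(new String(valid, StandardCharsets.UTF_8)); // prints "hi"
    }
}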

Usage

From source file:com.kylinolap.job.hadoop.cube.FactDistinctColumnsMapper.java

License:Apache License

@Override
public void map(KEYIN key, Text value, Context context) throws IOException, InterruptedException {

    try {
        bytesSplitter.split(value.getBytes(), value.getLength(), byteRowDelimiter);
        intermediateTableDesc.sanityCheck(bytesSplitter);
        SplittedBytes[] splitBuffers = bytesSplitter.getSplitBuffers();

        int[] flatTableIndexes = intermediateTableDesc.getRowKeyColumnIndexes();
        for (int i : factDictCols) {
            outputKey.set((short) i);
            SplittedBytes bytes = splitBuffers[flatTableIndexes[i]];
            outputValue.set(bytes.value, 0, bytes.length);
            context.write(outputKey, outputValue);
        }
    } catch (Exception ex) {
        handleErrorRecord(bytesSplitter, ex);
    }

}

From source file:com.kylinolap.job.hadoop.cube.FactDistinctColumnsReducer.java

License:Apache License

@Override
public void reduce(ShortWritable key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    TblColRef col = columnList.get(key.get());

    HashSet<ByteArray> set = new HashSet<ByteArray>();
    for (Text textValue : values) {
        ByteArray value = new ByteArray(Bytes.copy(textValue.getBytes(), 0, textValue.getLength()));
        set.add(value);
    }

    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    String outputPath = conf.get(BatchConstants.OUTPUT_PATH);
    FSDataOutputStream out = fs.create(new Path(outputPath, col.getName()));

    try {
        for (ByteArray value : set) {
            out.write(value.data);
            out.write('\n');
        }
    } finally {
        out.close();
    }

}

From source file:com.kylinolap.job.hadoop.cube.MergeCuboidMapper.java

License:Apache License

@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    long cuboidID = rowKeySplitter.split(key.getBytes(), key.getLength()); // only bytes up to getLength() are valid
    Cuboid cuboid = Cuboid.findById(cubeDesc, cuboidID);

    SplittedBytes[] splittedByteses = rowKeySplitter.getSplitBuffers();
    int bufOffset = 0;
    BytesUtil.writeLong(cuboidID, newKeyBuf, bufOffset, RowConstants.ROWKEY_CUBOIDID_LEN);
    bufOffset += RowConstants.ROWKEY_CUBOIDID_LEN;

    for (int i = 0; i < cuboid.getColumns().size(); ++i) {
        TblColRef col = cuboid.getColumns().get(i);

        if (this.checkNeedMerging(col)) {
            // if dictionary on fact table column, needs rewrite
            DictionaryManager dictMgr = DictionaryManager.getInstance(config);
            Dictionary<?> sourceDict = dictMgr.getDictionary(sourceCubeSegment.getDictResPath(col));
            Dictionary<?> mergedDict = dictMgr.getDictionary(mergedCubeSegment.getDictResPath(col));

            while (sourceDict.getSizeOfValue() > newKeyBuf.length - bufOffset
                    || mergedDict.getSizeOfValue() > newKeyBuf.length - bufOffset) {
                byte[] oldBuf = newKeyBuf;
                newKeyBuf = new byte[2 * newKeyBuf.length];
                System.arraycopy(oldBuf, 0, newKeyBuf, 0, oldBuf.length);
            }

            int idInSourceDict = BytesUtil.readUnsigned(splittedByteses[i + 1].value, 0,
                    splittedByteses[i + 1].length);
            int size = sourceDict.getValueBytesFromId(idInSourceDict, newKeyBuf, bufOffset);
            int idInMergedDict = mergedDict.getIdFromValueBytes(newKeyBuf, bufOffset, size);
            BytesUtil.writeUnsigned(idInMergedDict, newKeyBuf, bufOffset, mergedDict.getSizeOfId());

            bufOffset += mergedDict.getSizeOfId();
        } else {
            // keep as it is
            while (splittedByteses[i + 1].length > newKeyBuf.length - bufOffset) {
                byte[] oldBuf = newKeyBuf;
                newKeyBuf = new byte[2 * newKeyBuf.length];
                System.arraycopy(oldBuf, 0, newKeyBuf, 0, oldBuf.length);
            }

            System.arraycopy(splittedByteses[i + 1].value, 0, newKeyBuf, bufOffset,
                    splittedByteses[i + 1].length);
            bufOffset += splittedByteses[i + 1].length;
        }
    }
    byte[] newKey = Arrays.copyOf(newKeyBuf, bufOffset);
    outputKey.set(newKey, 0, newKey.length);

    context.write(outputKey, value);
}

From source file:com.kylinolap.job.hadoop.cube.NDCuboidMapper.java

License:Apache License

@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    long cuboidId = rowKeySplitter.split(key.getBytes(), key.getLength());
    Cuboid parentCuboid = Cuboid.findById(cubeDesc, cuboidId);

    Collection<Long> myChildren = cuboidScheduler.getSpanningCuboid(cuboidId);

    // if still empty or null
    if (myChildren == null || myChildren.size() == 0) {
        context.getCounter(BatchConstants.MAPREDUCE_COUTNER_GROUP_NAME, "Skipped records").increment(1L);
        skipCounter++;
        if (skipCounter % BatchConstants.COUNTER_MAX == 0) {
            logger.info("Skipped " + skipCounter + " records!");
        }
        return;
    }

    context.getCounter(BatchConstants.MAPREDUCE_COUTNER_GROUP_NAME, "Processed records").increment(1L);

    handleCounter++;
    if (handleCounter % BatchConstants.COUNTER_MAX == 0) {
        logger.info("Handled " + handleCounter + " records!");
    }

    for (Long child : myChildren) {
        Cuboid childCuboid = Cuboid.findById(cubeDesc, child);
        int keyLength = buildKey(parentCuboid, childCuboid, rowKeySplitter.getSplitBuffers());
        outputKey.set(keyBuf, 0, keyLength);
        context.write(outputKey, value);
    }

}

From source file:com.kylinolap.job.hadoop.cube.NewBaseCuboidMapper.java

License:Apache License

@Override
public void map(KEYIN key, Text value, Context context) throws IOException, InterruptedException {
    // combining the hive table flattening logic into base cuboid building.
    // the input of this mapper is the fact table rows

    counter++;
    if (counter % BatchConstants.COUNTER_MAX == 0) {
        logger.info("Handled " + counter + " records!");
    }

    if (!byteRowDelimiterInferred)
        byteRowDelimiter = bytesSplitter.inferByteRowDelimiter(value.getBytes(), value.getLength(),
                factTableDesc.getColumns().length);

    bytesSplitter.split(value.getBytes(), value.getLength(), byteRowDelimiter);

    try {
        byte[] rowKey = buildKey(bytesSplitter.getSplitBuffers());
        if (rowKey == null)
            return;// skip this fact table row

        outputKey.set(rowKey, 0, rowKey.length);

        buildValue(bytesSplitter.getSplitBuffers());
        outputValue.set(valueBuf.array(), 0, valueBuf.position());

        context.write(outputKey, outputValue);

    } catch (Throwable t) {
        logger.error("", t);
        context.getCounter(BatchConstants.MAPREDUCE_COUTNER_GROUP_NAME, "Error records").increment(1L);
        return;
    }
}

From source file:com.kylinolap.job.hadoop.cube.NewBaseCuboidMapperTest.java

License:Apache License

@Test
@Ignore
public void testMapperWithHeader() throws Exception {
    String cubeName = "test_kylin_cube_with_slr_ready";
    mapDriver.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
    // mapDriver.getConfiguration().set(BatchConstants.CFG_METADATA_URL,
    // metadata);
    mapDriver.withInput(new Text("key"), new Text("0,2013-05-05,Auction,80053,0,5,41.204172263562,0,10000638"));

    List<Pair<Text, Text>> result = mapDriver.run();

    CubeManager cubeMgr = CubeManager.getInstance(this.getTestConfig());
    CubeInstance cube = cubeMgr.getCube(cubeName);

    assertEquals(1, result.size());
    Text rowkey = result.get(0).getFirst();
    byte[] key = rowkey.getBytes();
    byte[] header = Bytes.head(key, 26);
    byte[] sellerId = Bytes.tail(header, 18);
    byte[] cuboidId = Bytes.head(header, 8);
    byte[] restKey = Bytes.tail(key, rowkey.getLength() - 26);

    RowKeyDecoder decoder = new RowKeyDecoder(cube.getFirstSegment());
    decoder.decode(key);
    assertEquals(
            "[10000638, 2013-05-05, Computers/Tablets & Networking, MonitorProjectors & Accs, Monitors, Auction, 0, 5]",
            decoder.getValues().toString());

    assertTrue(Bytes.toString(sellerId).startsWith("10000638"));
    assertEquals(255, Bytes.toLong(cuboidId));
    assertEquals(21, restKey.length);

    verifyMeasures(cube.getDescriptor().getMeasures(), result.get(0).getSecond(), "41.204172263562",
            "41.204172263562", "41.204172263562", 1);
}

From source file:com.kylinolap.job.hadoop.cube.RangeKeyDistributionReducer.java

License:Apache License

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    int nRegion = Math.round((float) gbPoints.size() / (float) cut);
    nRegion = Math.max(1, nRegion);
    nRegion = Math.min(MAX_REGION, nRegion);

    int gbPerRegion = gbPoints.size() / nRegion;
    gbPerRegion = Math.max(1, gbPerRegion);

    System.out.println(nRegion + " regions");
    System.out.println(gbPerRegion + " GB per region");

    for (int i = gbPerRegion; i < gbPoints.size(); i += gbPerRegion) {
        Text key = gbPoints.get(i);
        outputValue.set(i);
        System.out.println(StringUtils.byteToHexString(key.getBytes()) + "\t" + outputValue.get());
        context.write(key, outputValue);
    }
}

From source file:com.kylinolap.job.hadoop.invertedindex.IIDistinctColumnsMapper.java

License:Apache License

@Override
public void map(KEYIN key, Text value, Context context) throws IOException, InterruptedException {
    if (delim == -1) {
        delim = splitter.detectDelim(value, columns.length);
    }

    int nParts = splitter.split(value.getBytes(), value.getLength(), (byte) delim);
    SplittedBytes[] parts = splitter.getSplitBuffers();

    if (nParts != columns.length) {
        throw new RuntimeException("Got " + parts.length + " from -- " + value.toString() + " -- but only "
                + columns.length + " expected");
    }

    for (short i = 0; i < nParts; i++) {
        outputKey.set(i);
        outputValue.set(parts[i].value, 0, parts[i].length);
        context.write(outputKey, outputValue);
    }
}

From source file:com.kylinolap.job.hadoop.invertedindex.IIDistinctColumnsReducer.java

License:Apache License

@Override
public void reduce(ShortWritable key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    String columnName = columns[key.get()];

    HashSet<ByteArray> set = new HashSet<ByteArray>();
    for (Text textValue : values) {
        ByteArray value = new ByteArray(Bytes.copy(textValue.getBytes(), 0, textValue.getLength()));
        set.add(value);
    }

    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    String outputPath = conf.get(BatchConstants.OUTPUT_PATH);
    FSDataOutputStream out = fs.create(new Path(outputPath, columnName));

    try {
        for (ByteArray value : set) {
            out.write(value.data);
            out.write('\n');
        }
    } finally {
        out.close();
    }

}

From source file:com.kylinolap.job.hadoop.invertedindex.InvertedIndexMapper.java

License:Apache License

@Override
public void map(KEYIN key, Text value, Context context) throws IOException, InterruptedException {
    if (delim == -1) {
        delim = splitter.detectDelim(value, info.getColumnCount());
    }

    int nParts = splitter.split(value.getBytes(), value.getLength(), (byte) delim);
    SplittedBytes[] parts = splitter.getSplitBuffers();

    if (nParts != info.getColumnCount()) {
        throw new RuntimeException("Got " + parts.length + " from -- " + value.toString() + " -- but only "
                + info.getColumnCount() + " expected");
    }

    rec.reset();
    for (int i = 0; i < nParts; i++) {
        rec.setValueString(i, Bytes.toString(parts[i].value, 0, parts[i].length));
    }

    outputKey.set(rec.getTimestamp());
    // outputValue's backing byte array is the same as rec's

    context.write(outputKey, outputValue);
}