Example usage for org.apache.hadoop.io Text getLength

List of usage examples for org.apache.hadoop.io Text getLength

Introduction

On this page you can find example usages of org.apache.hadoop.io.Text.getLength().

Prototype

@Override
public int getLength() 

Source Link

Document

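Returns the number of valid bytes in the backing byte array. The array returned by getBytes() may be longer than this length, so callers should only read the first getLength() bytes.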

Usage

From source file:org.apache.kudu.mapreduce.tools.ImportCsvMapper.java

License:Apache License

/**
 * Converts one line of CSV text into a Kudu Insert and writes it to the context.
 *
 * <p>The line is parsed with {@code this.parser}; every parsed column is converted
 * according to the type of the matching schema column and added to the insert's row.
 * A line that fails to parse, or whose value cannot be converted (the numeric
 * {@code parseXxx} methods throw {@link NumberFormatException}, which is an
 * {@link IllegalArgumentException}), is either skipped and counted or fails the task,
 * depending on {@code this.skipBadLines}.
 *
 * @param offset  key of the input record; only used in the bad-line diagnostic message
 * @param value   the raw line; the parser is told to read {@code value.getLength()} bytes of
 *                its backing array
 * @param context the context the insert is written to
 * @throws IOException if the line is bad and bad lines are not skipped, or if writing to
 *                     the context is interrupted
 */
@Override
public void map(LongWritable offset, Text value, Context context) throws IOException {
    byte[] lineBytes = value.getBytes();

    try {
        CsvParser.ParsedLine parsed = this.parser.parse(lineBytes, value.getLength());

        Insert insert = this.table.newInsert();
        PartialRow row = insert.getRow();
        for (int i = 0; i < parsed.getColumnCount(); i++) {
            String colName = parsed.getColumnName(i);
            ColumnSchema col = this.schema.getColumn(colName);
            String colValue = Bytes.getString(parsed.getLineBytes(), parsed.getColumnOffset(i),
                    parsed.getColumnLength(i));
            switch (col.getType()) {
            case BOOL:
                row.addBoolean(colName, Boolean.parseBoolean(colValue));
                break;
            case INT8:
                row.addByte(colName, Byte.parseByte(colValue));
                break;
            case INT16:
                row.addShort(colName, Short.parseShort(colValue));
                break;
            case INT32:
                row.addInt(colName, Integer.parseInt(colValue));
                break;
            case INT64:
                row.addLong(colName, Long.parseLong(colValue));
                break;
            case STRING:
                row.addString(colName, colValue);
                break;
            case FLOAT:
                row.addFloat(colName, Float.parseFloat(colValue));
                break;
            case DOUBLE:
                row.addDouble(colName, Double.parseDouble(colValue));
                break;
            default:
                throw new IllegalArgumentException("Type " + col.getType() + " not recognized");
            }
        }
        context.write(NULL_KEY, insert);
    } catch (CsvParser.BadCsvLineException badLine) {
        if (!this.skipBadLines) {
            throw new IOException("Failing task because of a bad line", badLine);
        }
        System.err.println("Bad line at offset: " + offset.get() + ":\n" + badLine.getMessage());
        this.badLineCount.increment(1);
    } catch (IllegalArgumentException e) {
        if (!this.skipBadLines) {
            throw new IOException("Failing task because of an illegal argument", e);
        }
        System.err.println("Bad line at offset: " + offset.get() + ":\n" + e.getMessage());
        this.badLineCount.increment(1);
    } catch (InterruptedException e) {
        // Restore the interrupt flag before translating the exception, so code further up
        // the stack can still observe that this thread was interrupted.
        Thread.currentThread().interrupt();
        throw new IOException("Failing task since it was interrupted", e);
    }
}

From source file:org.apache.kylin.engine.mr.steps.CalculateStatsFromBaseCuboidReducer.java

License:Apache License

/**
 * Folds every HLL counter received for one cuboid into the reducer's running state.
 *
 * <p>The cuboid id is read from the key bytes. Each value is deserialized into a fresh
 * counter whose estimate is added to {@code totalRowsBeforeMerge} (and recorded in
 * {@code baseCuboidRowCountInMappers} when the cuboid is the base cuboid); the counter is
 * then merged into, or stored as, the entry of {@code cuboidHLLMap} for that cuboid.
 */
@Override
public void doReduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    final long cuboidId = Bytes.toLong(key.getBytes());
    logger.info("Cuboid id to be processed: " + cuboidId);

    for (Text serialized : values) {
        HLLCounter counter = new HLLCounter(cubeConfig.getCubeStatsHLLPrecision());
        // Wrap only the first getLength() bytes of the backing array.
        counter.readRegisters(ByteBuffer.wrap(serialized.getBytes(), 0, serialized.getLength()));

        if (cuboidId == baseCuboidId) {
            baseCuboidRowCountInMappers.add(counter.getCountEstimate());
        }

        totalRowsBeforeMerge += counter.getCountEstimate();

        if (cuboidHLLMap.get(cuboidId) == null) {
            // First counter seen for this cuboid: keep it as the accumulator.
            cuboidHLLMap.put(cuboidId, counter);
        } else {
            cuboidHLLMap.get(cuboidId).merge(counter);
        }
    }
}

From source file:org.apache.kylin.engine.mr.steps.CuboidReducer.java

License:Apache License

/**
 * Aggregates all values of one key and writes the encoded aggregate under the same key.
 *
 * <p>Each value is decoded into {@code input} and fed to {@code aggs}; the collected
 * states are then encoded and the bytes up to the buffer's position are written out.
 */
@Override
public void doReduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    aggs.reset();

    for (Text encoded : values) {
        // The counter is incremented before logging, so the logged ordinal is 1-based.
        boolean atLogThreshold = vcounter++ % BatchConstants.NORMAL_RECORD_LOG_THRESHOLD == 0;
        if (atLogThreshold) {
            logger.info("Handling value with ordinal (This is not KV number!): " + vcounter);
        }

        // Wrap only the first getLength() bytes of the backing array.
        ByteBuffer measureBytes = ByteBuffer.wrap(encoded.getBytes(), 0, encoded.getLength());
        codec.decode(measureBytes, input);
        aggs.aggregate(input, needAggrMeasures);
    }
    aggs.collectStates(result);

    ByteBuffer encodedResult = codec.encode(result);
    outputValue.set(encodedResult.array(), 0, encodedResult.position());
    context.write(key, outputValue);
}

From source file:org.apache.kylin.engine.mr.steps.FactDistinctColumnsReducer.java

License:Apache License

/**
 * Handles one key according to the reducer's role and then bumps {@code rowCount}.
 *
 * <ul>
 *   <li>statistics: merges the HLL counters in {@code values} into {@code cuboidHLLMap};</li>
 *   <li>partition column: tracks the minimum and maximum time parsed from the key;</li>
 *   <li>normal column: either adds the key's value to the dictionary builder or writes
 *       the key bytes to the multiple-outputs writer.</li>
 * </ul>
 *
 * <p>In every branch the first byte of the key is skipped; the payload starts at index 1.
 */
@Override
public void doReduce(SelfDefineSortableKey skey, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    final Text key = skey.getText();

    if (isStatistics) {
        // HLL statistics: the cuboid id follows the leading byte of the key.
        final long cuboidId = Bytes.toLong(key.getBytes(), 1, Bytes.SIZEOF_LONG);
        for (Text serialized : values) {
            HLLCounter counter = new HLLCounter(cubeConfig.getCubeStatsHLLPrecision());
            // Wrap only the first getLength() bytes of the backing array.
            counter.readRegisters(ByteBuffer.wrap(serialized.getBytes(), 0, serialized.getLength()));

            totalRowsBeforeMerge += counter.getCountEstimate();

            if (cuboidId == baseCuboidId) {
                baseCuboidRowCountInMappers.add(counter.getCountEstimate());
            }

            if (cuboidHLLMap.get(cuboidId) == null) {
                // First counter seen for this cuboid: keep it as the accumulator.
                cuboidHLLMap.put(cuboidId, counter);
            } else {
                cuboidHLLMap.get(cuboidId).merge(counter);
            }
        }
    } else if (isPartitionCol) {
        // Partition column: widen the [timeMinValue, timeMaxValue] range.
        String text = Bytes.toString(key.getBytes(), 1, key.getLength() - 1);
        logAFewRows(text);
        long millis = DateFormat.stringToMillis(text);
        timeMinValue = Math.min(timeMinValue, millis);
        timeMaxValue = Math.max(timeMaxValue, millis);
    } else if (buildDictInReducer) {
        // Normal column, dictionary built in this reducer.
        String text = Bytes.toString(key.getBytes(), 1, key.getLength() - 1);
        logAFewRows(text);
        builder.addValue(text);
    } else {
        // Normal column, raw values written out.
        byte[] payload = Bytes.copy(key.getBytes(), 1, key.getLength() - 1);
        // output written to baseDir/colName/-r-00000 (etc)
        String fileName = col.getIdentity() + "/";
        mos.write(BatchConstants.CFG_OUTPUT_COLUMN, NullWritable.get(), new Text(payload), fileName);
    }

    rowCount++;
}

From source file:org.apache.kylin.engine.mr.steps.MergeCuboidMapper.java

License:Apache License

/**
 * Rewrites one cuboid row for the merged segment and writes it to the context.
 *
 * <p>The row key is split into its columns. For every column that
 * {@code checkNeedMerging} reports, the dictionary id is translated from the source
 * segment's dictionary to the merged segment's dictionary; all other columns are copied
 * unchanged. The rebuilt key body is then encoded into {@code outputKey}. If any
 * dictionary-backed measures exist, the value is decoded, those measures are re-encoded,
 * and the re-encoded value is written instead of the original one.
 *
 * @param key     the encoded row key of the source segment
 * @param value   the encoded measures; only the first {@code value.getLength()} bytes are decoded
 * @param context the context the rewritten key/value pair is written to
 */
@Override
public void doMap(Text key, Text value, Context context) throws IOException, InterruptedException {
    // split() returns the cuboid id and fills the splitter's buffers, which are read below
    long cuboidID = rowKeySplitter.split(key.getBytes());
    Cuboid cuboid = Cuboid.findById(cubeDesc, cuboidID);
    RowKeyEncoder rowkeyEncoder = rowKeyEncoderProvider.getRowkeyEncoder(cuboid);

    SplittedBytes[] splittedByteses = rowKeySplitter.getSplitBuffers();
    // write position inside newKeyBodyBuf
    int bufOffset = 0;
    // index of the split that holds the first column of the key body
    int bodySplitOffset = rowKeySplitter.getBodySplitOffset();

    for (int i = 0; i < cuboid.getColumns().size(); ++i) {
        int useSplit = i + bodySplitOffset;
        TblColRef col = cuboid.getColumns().get(i);

        if (this.checkNeedMerging(col)) {
            // if dictionary on fact table column, needs rewrite
            DictionaryManager dictMgr = DictionaryManager.getInstance(config);
            Dictionary<String> mergedDict = dictMgr.getDictionary(mergedCubeSegment.getDictResPath(col));

            Dictionary<String> sourceDict;
            // handle the column that all records is null
            if (sourceCubeSegment.getDictionary(col) == null) {
                // no source dictionary: write the merged dictionary's null id and move on
                // NOTE(review): this path does not grow newKeyBodyBuf first — confirm the
                // buffer is always large enough here
                BytesUtil.writeUnsigned(mergedDict.nullId(), newKeyBodyBuf, bufOffset,
                        mergedDict.getSizeOfId());
                bufOffset += mergedDict.getSizeOfId();
                continue;
            } else {
                sourceDict = dictMgr.getDictionary(sourceCubeSegment.getDictResPath(col));
            }

            // double newKeyBodyBuf (keeping its content) until the remaining space can hold
            // a value of either dictionary and an id of the merged dictionary
            while (sourceDict.getSizeOfValue() > newKeyBodyBuf.length - bufOffset || //
                    mergedDict.getSizeOfValue() > newKeyBodyBuf.length - bufOffset || //
                    mergedDict.getSizeOfId() > newKeyBodyBuf.length - bufOffset) {
                byte[] oldBuf = newKeyBodyBuf;
                newKeyBodyBuf = new byte[2 * newKeyBodyBuf.length];
                System.arraycopy(oldBuf, 0, newKeyBodyBuf, 0, oldBuf.length);
            }

            // the split bytes of this column hold its id in the source dictionary
            int idInSourceDict = BytesUtil.readUnsigned(splittedByteses[useSplit].value, 0,
                    splittedByteses[useSplit].length);
            int idInMergedDict;

            //int size = sourceDict.getValueBytesFromId(idInSourceDict, newKeyBodyBuf, bufOffset);
            // translate source id -> value -> merged id; a null value maps to the merged null id
            String v = sourceDict.getValueFromId(idInSourceDict);
            if (v == null) {
                idInMergedDict = mergedDict.nullId();
            } else {
                idInMergedDict = mergedDict.getIdFromValue(v);
            }

            BytesUtil.writeUnsigned(idInMergedDict, newKeyBodyBuf, bufOffset, mergedDict.getSizeOfId());
            bufOffset += mergedDict.getSizeOfId();
        } else {
            // keep as it is
            // double newKeyBodyBuf (keeping its content) until the column's bytes fit
            while (splittedByteses[useSplit].length > newKeyBodyBuf.length - bufOffset) {
                byte[] oldBuf = newKeyBodyBuf;
                newKeyBodyBuf = new byte[2 * newKeyBodyBuf.length];
                System.arraycopy(oldBuf, 0, newKeyBodyBuf, 0, oldBuf.length);
            }

            System.arraycopy(splittedByteses[useSplit].value, 0, newKeyBodyBuf, bufOffset,
                    splittedByteses[useSplit].length);
            bufOffset += splittedByteses[useSplit].length;
        }
    }

    // make sure the output key buffer can hold the full encoded key
    int fullKeySize = rowkeyEncoder.getBytesLength();
    while (newKeyBuf.array().length < fullKeySize) {
        // NOTE(review): the new size is derived from newKeyBuf.length(), not from the
        // backing array's length — confirm the two cannot diverge in a way that stalls growth
        newKeyBuf.set(new byte[newKeyBuf.length() * 2]);
    }
    // presumably sets the buffer's offset to 0 and its length to fullKeySize — TODO confirm
    newKeyBuf.set(0, fullKeySize);

    // encode the first bufOffset bytes of the rebuilt key body into newKeyBuf
    rowkeyEncoder.encode(new ByteArray(newKeyBodyBuf, 0, bufOffset), newKeyBuf);
    outputKey.set(newKeyBuf.array(), 0, fullKeySize);

    // re-encode measures if dictionary is used
    if (dictMeasures.size() > 0) {
        // decode only the first getLength() bytes of the value's backing array
        codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), measureObjs);
        for (Pair<Integer, MeasureIngester> pair : dictMeasures) {
            int i = pair.getFirst();
            MeasureIngester ingester = pair.getSecond();
            measureObjs[i] = ingester.reEncodeDictionary(measureObjs[i], measureDescs.get(i), oldDicts,
                    newDicts);
        }
        ByteBuffer valueBuf = codec.encode(measureObjs);
        outputValue.set(valueBuf.array(), 0, valueBuf.position());
        // from here on the re-encoded value replaces the incoming one
        value = outputValue;
    }

    context.write(outputKey, value);
}

From source file:org.apache.kylin.engine.mr.steps.NumberDictionaryForestTest.java

License:Apache License

/**
 * Prints the key's type id and field value to standard output and returns the field value.
 *
 * <p>The field value is decoded from the key's text, skipping its first byte.
 */
private String printKey(SelfDefineSortableKey key) {
    final Text text = key.getText();
    final byte[] raw = text.getBytes();
    // Skip the leading byte; only getLength() bytes of the backing array are in use.
    final int payloadLength = text.getLength() - 1;
    final String fieldValue = Bytes.toString(raw, 1, payloadLength);

    System.out.println("type flag:" + key.getTypeId() + " fieldValue:" + fieldValue);
    return fieldValue;
}

From source file:org.apache.kylin.engine.mr.steps.NumberDictionaryForestTest.java

License:Apache License

/**
 * Returns the field value decoded from the key's text, skipping its first byte.
 */
private String getFieldValue(SelfDefineSortableKey key) {
    final Text text = key.getText();
    final byte[] raw = text.getBytes();
    // Skip the leading byte; only getLength() bytes of the backing array are in use.
    final int payloadLength = text.getLength() - 1;
    return Bytes.toString(raw, 1, payloadLength);
}

From source file:org.apache.kylin.engine.mr.steps.RowKeyDistributionCheckerMapper.java

License:Apache License

/**
 * Adds the size of this record to the first entry of {@code keyList} that compares
 * greater than the record's key.
 *
 * <p>The size is the key length plus the value length in bytes. A record whose key is
 * not smaller than any entry of {@code keyList} is not counted.
 */
@Override
public void doMap(Text key, Text value, Context context) throws IOException, InterruptedException {
    for (Text candidate : keyList) {
        if (key.compareTo(candidate) >= 0) {
            continue;
        }
        Long accumulated = resultMap.get(candidate);
        // Widen before adding so the sum of the two int lengths cannot overflow.
        long recordSize = (long) key.getLength() + value.getLength();
        resultMap.put(candidate, accumulated + recordSize);
        // Only the first matching entry is updated.
        return;
    }
}

From source file:org.apache.kylin.engine.mr.steps.SegmentReEncoder.java

License:Apache License

/**
 * Re-encode with both dimension and measure in encoded (Text) format.
 *
 * <p>When no dictionary-backed measures exist the value is passed through unchanged;
 * otherwise the value is decoded, each dictionary-backed measure is re-encoded, and the
 * result is encoded into a new Text.
 *
 * @param key   the encoded row key; it is passed through {@code processKey}
 * @param value the encoded measures; only the first {@code value.getLength()} bytes are decoded
 * @return a pair of the processed key and the (possibly re-encoded) value
 * @throws IOException declared by this method; thrown by the calls it makes, if at all
 * @throws IllegalStateException if this encoder has not been initialized
 */
public Pair<Text, Text> reEncode(Text key, Text value) throws IOException {
    if (!initialized) {
        throw new IllegalStateException("Not initialized");
    }
    Object[] measureObjs = new Object[measureDescs.size()];

    if (dictMeasures.size() <= 0) {
        // Nothing to re-encode: hand the original value back.
        return Pair.newPair(processKey(key), value);
    }

    // re-encode measures if dictionary is used
    ByteBuffer encodedInput = ByteBuffer.wrap(value.getBytes(), 0, value.getLength());
    codec.decode(encodedInput, measureObjs);
    for (Pair<Integer, MeasureIngester> entry : dictMeasures) {
        int index = entry.getFirst();
        MeasureIngester ingester = entry.getSecond();
        measureObjs[index] = ingester.reEncodeDictionary(measureObjs[index], measureDescs.get(index),
                oldDicts, newDicts);
    }

    // Copy the bytes written so far (up to the buffer's position) into an exact-size array.
    ByteBuffer valueBuf = codec.encode(measureObjs);
    int encodedLength = valueBuf.position();
    byte[] resultValue = new byte[encodedLength];
    System.arraycopy(valueBuf.array(), 0, resultValue, 0, encodedLength);

    return Pair.newPair(processKey(key), new Text(resultValue));
}

From source file:org.apache.kylin.engine.mr.steps.SegmentReEncoder.java

License:Apache License

/**
 * Re-encode with measures in Object[] format.
 *
 * <p>The value is always decoded into a measure array; each dictionary-backed measure in
 * that array is then replaced by its re-encoded form. The measures are returned as
 * objects, so they are not serialized again here.
 *
 * @param key   the encoded row key; it is passed through {@code processKey}
 * @param value the encoded measures; only the first {@code value.getLength()} bytes are decoded
 * @return a pair of the processed key and the decoded (and, where applicable, re-encoded) measures
 * @throws IOException declared by this method; thrown by the calls it makes, if at all
 * @throws IllegalStateException if this encoder has not been initialized
 */
public Pair<Text, Object[]> reEncode2(Text key, Text value) throws IOException {
    if (!initialized) {
        throw new IllegalStateException("Not initialized");
    }

    Object[] measureObjs = new Object[measureDescs.size()];
    codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), measureObjs);

    // re-encode measures if dictionary is used (no-op when dictMeasures is empty)
    for (Pair<Integer, MeasureIngester> pair : dictMeasures) {
        int i = pair.getFirst();
        MeasureIngester ingester = pair.getSecond();
        measureObjs[i] = ingester.reEncodeDictionary(measureObjs[i], measureDescs.get(i), oldDicts,
                newDicts);
    }
    // The previous version also encoded measureObjs and copied the bytes into a local
    // array that was never read; that dead work has been dropped.
    return Pair.newPair(processKey(key), measureObjs);
}