Usage examples for org.apache.hadoop.io.Text#getLength()
@Override public int getLength()
From source file:org.apache.kudu.mapreduce.tools.ImportCsvMapper.java
License:Apache License
/**
 * Converts one line of CSV text into a Kudu {@code Insert} and writes it to the context.
 *
 * <p>Every parsed column is looked up in the schema by name and its text is converted
 * according to the column type before being added to the insert's row.
 *
 * @param offset offset of the line; used only in the bad-line diagnostic message
 * @param value the CSV line; its backing array is parsed up to {@code value.getLength()}
 * @param context the context that receives the resulting insert
 * @throws IOException if the line is bad (or a value is illegal) and bad lines are not
 *     being skipped, or if the task is interrupted while writing
 */
@Override
public void map(LongWritable offset, Text value, Context context) throws IOException {
    byte[] lineBytes = value.getBytes();
    try {
        CsvParser.ParsedLine parsed = this.parser.parse(lineBytes, value.getLength());
        Insert insert = this.table.newInsert();
        PartialRow row = insert.getRow();
        for (int i = 0; i < parsed.getColumnCount(); i++) {
            String colName = parsed.getColumnName(i);
            ColumnSchema col = this.schema.getColumn(colName);
            String colValue = Bytes.getString(parsed.getLineBytes(), parsed.getColumnOffset(i),
                    parsed.getColumnLength(i));
            switch (col.getType()) {
                case BOOL:
                    row.addBoolean(colName, Boolean.parseBoolean(colValue));
                    break;
                case INT8:
                    row.addByte(colName, Byte.parseByte(colValue));
                    break;
                case INT16:
                    row.addShort(colName, Short.parseShort(colValue));
                    break;
                case INT32:
                    row.addInt(colName, Integer.parseInt(colValue));
                    break;
                case INT64:
                    row.addLong(colName, Long.parseLong(colValue));
                    break;
                case STRING:
                    row.addString(colName, colValue);
                    break;
                case FLOAT:
                    row.addFloat(colName, Float.parseFloat(colValue));
                    break;
                case DOUBLE:
                    row.addDouble(colName, Double.parseDouble(colValue));
                    break;
                default:
                    throw new IllegalArgumentException("Type " + col.getType() + " not recognized");
            }
        }
        context.write(NULL_KEY, insert);
    } catch (CsvParser.BadCsvLineException badLine) {
        if (this.skipBadLines) {
            System.err.println("Bad line at offset: " + offset.get() + ":\n" + badLine.getMessage());
            this.badLineCount.increment(1);
        } else {
            throw new IOException("Failing task because of a bad line", badLine);
        }
    } catch (IllegalArgumentException e) {
        // NumberFormatException from the parseXxx calls is an IllegalArgumentException,
        // so unparsable numeric values land here as well as unrecognized column types.
        if (this.skipBadLines) {
            System.err.println("Bad line at offset: " + offset.get() + ":\n" + e.getMessage());
            this.badLineCount.increment(1);
        } else {
            throw new IOException("Failing task because of an illegal argument", e);
        }
    } catch (InterruptedException e) {
        // Restore the interrupt status: wrapping the exception alone would hide the
        // interruption from any code further up the stack that checks the flag.
        Thread.currentThread().interrupt();
        throw new IOException("Failing task since it was interrupted", e);
    }
}
From source file:org.apache.kylin.engine.mr.steps.CalculateStatsFromBaseCuboidReducer.java
License:Apache License
@Override public void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { long cuboidId = Bytes.toLong(key.getBytes()); logger.info("Cuboid id to be processed: " + cuboidId); for (Text value : values) { HLLCounter hll = new HLLCounter(cubeConfig.getCubeStatsHLLPrecision()); ByteBuffer bf = ByteBuffer.wrap(value.getBytes(), 0, value.getLength()); hll.readRegisters(bf);//from w w w .j a v a2 s . com if (cuboidId == baseCuboidId) { baseCuboidRowCountInMappers.add(hll.getCountEstimate()); } totalRowsBeforeMerge += hll.getCountEstimate(); if (cuboidHLLMap.get(cuboidId) != null) { cuboidHLLMap.get(cuboidId).merge(hll); } else { cuboidHLLMap.put(cuboidId, hll); } } }
From source file:org.apache.kylin.engine.mr.steps.CuboidReducer.java
License:Apache License
/**
 * Aggregates all encoded measure values received for {@code key} and writes a single
 * encoded result for that key.
 *
 * @param key the key the values belong to; written back unchanged
 * @param values encoded measure values to decode and aggregate
 * @param context receives the key together with the encoded aggregate
 */
@Override
public void doReduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    aggs.reset();

    for (Text encoded : values) {
        // Post-increment: the threshold test sees the old counter value, while the
        // message below prints the already-incremented one.
        boolean atLogThreshold = vcounter++ % BatchConstants.NORMAL_RECORD_LOG_THRESHOLD == 0;
        if (atLogThreshold) {
            logger.info("Handling value with ordinal (This is not KV number!): " + vcounter);
        }

        ByteBuffer measureBytes = ByteBuffer.wrap(encoded.getBytes(), 0, encoded.getLength());
        codec.decode(measureBytes, input);
        aggs.aggregate(input, needAggrMeasures);
    }

    aggs.collectStates(result);

    ByteBuffer encodedResult = codec.encode(result);
    // Everything before the buffer's position is the encoded payload.
    outputValue.set(encodedResult.array(), 0, encodedResult.position());
    context.write(key, outputValue);
}
From source file:org.apache.kylin.engine.mr.steps.FactDistinctColumnsReducer.java
License:Apache License
@Override public void doReduce(SelfDefineSortableKey skey, Iterable<Text> values, Context context) throws IOException, InterruptedException { Text key = skey.getText();//from w ww . j av a 2s .c om if (isStatistics) { // for hll long cuboidId = Bytes.toLong(key.getBytes(), 1, Bytes.SIZEOF_LONG); for (Text value : values) { HLLCounter hll = new HLLCounter(cubeConfig.getCubeStatsHLLPrecision()); ByteBuffer bf = ByteBuffer.wrap(value.getBytes(), 0, value.getLength()); hll.readRegisters(bf); totalRowsBeforeMerge += hll.getCountEstimate(); if (cuboidId == baseCuboidId) { baseCuboidRowCountInMappers.add(hll.getCountEstimate()); } if (cuboidHLLMap.get(cuboidId) != null) { cuboidHLLMap.get(cuboidId).merge(hll); } else { cuboidHLLMap.put(cuboidId, hll); } } } else if (isPartitionCol) { // partition col String value = Bytes.toString(key.getBytes(), 1, key.getLength() - 1); logAFewRows(value); long time = DateFormat.stringToMillis(value); timeMinValue = Math.min(timeMinValue, time); timeMaxValue = Math.max(timeMaxValue, time); } else { // normal col if (buildDictInReducer) { String value = Bytes.toString(key.getBytes(), 1, key.getLength() - 1); logAFewRows(value); builder.addValue(value); } else { byte[] keyBytes = Bytes.copy(key.getBytes(), 1, key.getLength() - 1); // output written to baseDir/colName/-r-00000 (etc) String fileName = col.getIdentity() + "/"; mos.write(BatchConstants.CFG_OUTPUT_COLUMN, NullWritable.get(), new Text(keyBytes), fileName); } } rowCount++; }
From source file:org.apache.kylin.engine.mr.steps.MergeCuboidMapper.java
License:Apache License
@Override public void doMap(Text key, Text value, Context context) throws IOException, InterruptedException { long cuboidID = rowKeySplitter.split(key.getBytes()); Cuboid cuboid = Cuboid.findById(cubeDesc, cuboidID); RowKeyEncoder rowkeyEncoder = rowKeyEncoderProvider.getRowkeyEncoder(cuboid); SplittedBytes[] splittedByteses = rowKeySplitter.getSplitBuffers(); int bufOffset = 0; int bodySplitOffset = rowKeySplitter.getBodySplitOffset(); for (int i = 0; i < cuboid.getColumns().size(); ++i) { int useSplit = i + bodySplitOffset; TblColRef col = cuboid.getColumns().get(i); if (this.checkNeedMerging(col)) { // if dictionary on fact table column, needs rewrite DictionaryManager dictMgr = DictionaryManager.getInstance(config); Dictionary<String> mergedDict = dictMgr.getDictionary(mergedCubeSegment.getDictResPath(col)); Dictionary<String> sourceDict; // handle the column that all records is null if (sourceCubeSegment.getDictionary(col) == null) { BytesUtil.writeUnsigned(mergedDict.nullId(), newKeyBodyBuf, bufOffset, mergedDict.getSizeOfId()); bufOffset += mergedDict.getSizeOfId(); continue; } else { sourceDict = dictMgr.getDictionary(sourceCubeSegment.getDictResPath(col)); }//from w w w. j a va 2s. 
c o m while (sourceDict.getSizeOfValue() > newKeyBodyBuf.length - bufOffset || // mergedDict.getSizeOfValue() > newKeyBodyBuf.length - bufOffset || // mergedDict.getSizeOfId() > newKeyBodyBuf.length - bufOffset) { byte[] oldBuf = newKeyBodyBuf; newKeyBodyBuf = new byte[2 * newKeyBodyBuf.length]; System.arraycopy(oldBuf, 0, newKeyBodyBuf, 0, oldBuf.length); } int idInSourceDict = BytesUtil.readUnsigned(splittedByteses[useSplit].value, 0, splittedByteses[useSplit].length); int idInMergedDict; //int size = sourceDict.getValueBytesFromId(idInSourceDict, newKeyBodyBuf, bufOffset); String v = sourceDict.getValueFromId(idInSourceDict); if (v == null) { idInMergedDict = mergedDict.nullId(); } else { idInMergedDict = mergedDict.getIdFromValue(v); } BytesUtil.writeUnsigned(idInMergedDict, newKeyBodyBuf, bufOffset, mergedDict.getSizeOfId()); bufOffset += mergedDict.getSizeOfId(); } else { // keep as it is while (splittedByteses[useSplit].length > newKeyBodyBuf.length - bufOffset) { byte[] oldBuf = newKeyBodyBuf; newKeyBodyBuf = new byte[2 * newKeyBodyBuf.length]; System.arraycopy(oldBuf, 0, newKeyBodyBuf, 0, oldBuf.length); } System.arraycopy(splittedByteses[useSplit].value, 0, newKeyBodyBuf, bufOffset, splittedByteses[useSplit].length); bufOffset += splittedByteses[useSplit].length; } } int fullKeySize = rowkeyEncoder.getBytesLength(); while (newKeyBuf.array().length < fullKeySize) { newKeyBuf.set(new byte[newKeyBuf.length() * 2]); } newKeyBuf.set(0, fullKeySize); rowkeyEncoder.encode(new ByteArray(newKeyBodyBuf, 0, bufOffset), newKeyBuf); outputKey.set(newKeyBuf.array(), 0, fullKeySize); // re-encode measures if dictionary is used if (dictMeasures.size() > 0) { codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), measureObjs); for (Pair<Integer, MeasureIngester> pair : dictMeasures) { int i = pair.getFirst(); MeasureIngester ingester = pair.getSecond(); measureObjs[i] = ingester.reEncodeDictionary(measureObjs[i], measureDescs.get(i), oldDicts, newDicts); } 
ByteBuffer valueBuf = codec.encode(measureObjs); outputValue.set(valueBuf.array(), 0, valueBuf.position()); value = outputValue; } context.write(outputKey, value); }
From source file:org.apache.kylin.engine.mr.steps.NumberDictionaryForestTest.java
License:Apache License
/**
 * Decodes the field value carried by {@code key} (skipping the first byte of its text),
 * prints it to standard output together with the key's type id, and returns it.
 *
 * @param key the key to decode and print
 * @return the decoded field value
 */
private String printKey(SelfDefineSortableKey key) {
    final Text text = key.getText();
    // The payload starts after the first byte, hence offset 1 and length - 1.
    final int payloadLength = text.getLength() - 1;
    final String fieldValue = Bytes.toString(text.getBytes(), 1, payloadLength);

    System.out.println("type flag:" + key.getTypeId() + " fieldValue:" + fieldValue);
    return fieldValue;
}
From source file:org.apache.kylin.engine.mr.steps.NumberDictionaryForestTest.java
License:Apache License
/**
 * Decodes the field value carried by {@code key}, skipping the first byte of its text.
 *
 * @param key the key to decode
 * @return the decoded field value
 */
private String getFieldValue(SelfDefineSortableKey key) {
    final Text text = key.getText();
    // The payload starts after the first byte, hence offset 1 and length - 1.
    final int payloadLength = text.getLength() - 1;
    final String fieldValue = Bytes.toString(text.getBytes(), 1, payloadLength);
    return fieldValue;
}
From source file:org.apache.kylin.engine.mr.steps.RowKeyDistributionCheckerMapper.java
License:Apache License
/**
 * Adds the byte size of this record (key length plus value length) to the counter of
 * the first entry in {@code keyList} that compares greater than {@code key}.
 *
 * <p>If no entry in {@code keyList} is greater than the key, nothing is recorded.
 *
 * @param key the record key
 * @param value the record value
 * @param context unused here
 */
@Override
public void doMap(Text key, Text value, Context context) throws IOException, InterruptedException {
    for (Text boundary : keyList) {
        if (key.compareTo(boundary) >= 0) {
            continue;
        }

        Long previous = resultMap.get(boundary);
        // Widen before adding so the sum of the two int lengths is computed as a long.
        long recordBytes = (long) key.getLength() + value.getLength();
        resultMap.put(boundary, previous + recordBytes);
        break;
    }
}
From source file:org.apache.kylin.engine.mr.steps.SegmentReEncoder.java
License:Apache License
/** * Re-encode with both dimension and measure in encoded (Text) format. * @param key//from www. j av a2s. co m * @param value * @return * @throws IOException */ public Pair<Text, Text> reEncode(Text key, Text value) throws IOException { if (initialized == false) { throw new IllegalStateException("Not initialized"); } Object[] measureObjs = new Object[measureDescs.size()]; // re-encode measures if dictionary is used if (dictMeasures.size() > 0) { codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), measureObjs); for (Pair<Integer, MeasureIngester> pair : dictMeasures) { int i = pair.getFirst(); MeasureIngester ingester = pair.getSecond(); measureObjs[i] = ingester.reEncodeDictionary(measureObjs[i], measureDescs.get(i), oldDicts, newDicts); } ByteBuffer valueBuf = codec.encode(measureObjs); byte[] resultValue = new byte[valueBuf.position()]; System.arraycopy(valueBuf.array(), 0, resultValue, 0, valueBuf.position()); return Pair.newPair(processKey(key), new Text(resultValue)); } else { return Pair.newPair(processKey(key), value); } }
From source file:org.apache.kylin.engine.mr.steps.SegmentReEncoder.java
License:Apache License
/** * Re-encode with measures in Object[] format. * @param key// w w w . j av a 2s . com * @param value * @return * @throws IOException */ public Pair<Text, Object[]> reEncode2(Text key, Text value) throws IOException { if (initialized == false) { throw new IllegalStateException("Not initialized"); } Object[] measureObjs = new Object[measureDescs.size()]; codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), measureObjs); // re-encode measures if dictionary is used if (dictMeasures.size() > 0) { for (Pair<Integer, MeasureIngester> pair : dictMeasures) { int i = pair.getFirst(); MeasureIngester ingester = pair.getSecond(); measureObjs[i] = ingester.reEncodeDictionary(measureObjs[i], measureDescs.get(i), oldDicts, newDicts); } ByteBuffer valueBuf = codec.encode(measureObjs); byte[] resultValue = new byte[valueBuf.position()]; System.arraycopy(valueBuf.array(), 0, resultValue, 0, valueBuf.position()); } return Pair.newPair(processKey(key), measureObjs); }