List of usage examples for org.apache.hadoop.io.NullWritable.get()
public static NullWritable get()
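NullWritable.get() returns the single shared instance of NullWritable, a Writable that serializes to zero bytes. It is used as a placeholder wherever the key/value contract requires a type but only one side of the pair carries data. A minimal sketch of the common pattern (the mapper class and names below are illustrative, not taken from the examples that follow):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative mapper: emit each input line as the key and NullWritable.get()
// as the value, so the shuffle carries no value bytes at all.
public class DistinctLinesMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // NullWritable.get() always returns the same singleton; nothing is allocated per record.
        context.write(value, NullWritable.get());
    }
}

In the driver, such a mapper is paired with job.setMapOutputValueClass(NullWritable.class); the same singleton can also be handed to SequenceFile readers and writers, as the examples below show.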
From source file:org.apache.kylin.engine.mr.steps.CreateDictionaryJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OPTION_CUBE_NAME);
    options.addOption(OPTION_SEGMENT_ID);
    options.addOption(OPTION_INPUT_PATH);
    parseOptions(options, args);

    final String cubeName = getOptionValue(OPTION_CUBE_NAME);
    final String segmentID = getOptionValue(OPTION_SEGMENT_ID);
    final String factColumnsInputPath = getOptionValue(OPTION_INPUT_PATH);

    final KylinConfig config = KylinConfig.getInstanceFromEnv();

    DictionaryGeneratorCLI.processSegment(config, cubeName, segmentID, new DistinctColumnValuesProvider() {
        @Override
        public IReadableTable getDistinctValuesFor(TblColRef col) {
            return new SortedColumnDFSFile(factColumnsInputPath + "/" + col.getIdentity(), col.getType());
        }
    }, new DictionaryProvider() {
        @Override
        public Dictionary<String> getDictionary(TblColRef col) throws IOException {
            Path colDir = new Path(factColumnsInputPath, col.getIdentity());
            FileSystem fs = HadoopUtil.getWorkingFileSystem();
            Path dictFile = HadoopUtil.getFilterOnlyPath(fs, colDir,
                    col.getName() + FactDistinctColumnsReducer.DICT_FILE_POSTFIX);
            if (dictFile == null) {
                logger.info("Dict for '" + col.getName() + "' not pre-built.");
                return null;
            }

            try (SequenceFile.Reader reader = new SequenceFile.Reader(HadoopUtil.getCurrentConfiguration(),
                    SequenceFile.Reader.file(dictFile))) {
                NullWritable key = NullWritable.get();
                BytesWritable value = new BytesWritable();
                reader.next(key, value);

                ByteBuffer buffer = new ByteArray(value.getBytes()).asBuffer();
                try (DataInputStream is = new DataInputStream(new ByteBufferBackedInputStream(buffer))) {
                    String dictClassName = is.readUTF();
                    Dictionary<String> dict = (Dictionary<String>) ClassUtil.newInstance(dictClassName);
                    dict.readFields(is);
                    logger.info("DictionaryProvider read dict from file: " + dictFile);
                    return dict;
                }
            }
        }
    });

    return 0;
}
From source file:org.apache.kylin.engine.mr.steps.FactDistinctColumnsReducer.java
License:Apache License
@Override
public void doReduce(SelfDefineSortableKey skey, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    Text key = skey.getText();
    if (isStatistics) {
        // for hll
        long cuboidId = Bytes.toLong(key.getBytes(), 1, Bytes.SIZEOF_LONG);
        for (Text value : values) {
            HLLCounter hll = new HLLCounter(cubeConfig.getCubeStatsHLLPrecision());
            ByteBuffer bf = ByteBuffer.wrap(value.getBytes(), 0, value.getLength());
            hll.readRegisters(bf);

            totalRowsBeforeMerge += hll.getCountEstimate();

            if (cuboidId == baseCuboidId) {
                baseCuboidRowCountInMappers.add(hll.getCountEstimate());
            }

            if (cuboidHLLMap.get(cuboidId) != null) {
                cuboidHLLMap.get(cuboidId).merge(hll);
            } else {
                cuboidHLLMap.put(cuboidId, hll);
            }
        }
    } else if (isPartitionCol) {
        // partition col
        String value = Bytes.toString(key.getBytes(), 1, key.getLength() - 1);
        logAFewRows(value);
        long time = DateFormat.stringToMillis(value);
        timeMinValue = Math.min(timeMinValue, time);
        timeMaxValue = Math.max(timeMaxValue, time);
    } else {
        // normal col
        if (buildDictInReducer) {
            String value = Bytes.toString(key.getBytes(), 1, key.getLength() - 1);
            logAFewRows(value);
            builder.addValue(value);
        } else {
            byte[] keyBytes = Bytes.copy(key.getBytes(), 1, key.getLength() - 1);
            // output written to baseDir/colName/-r-00000 (etc)
            String fileName = col.getIdentity() + "/";
            mos.write(BatchConstants.CFG_OUTPUT_COLUMN, NullWritable.get(), new Text(keyBytes), fileName);
        }
    }

    rowCount++;
}
From source file:org.apache.kylin.engine.mr.steps.FactDistinctColumnsReducer.java
License:Apache License
private void outputPartitionInfo() throws IOException, InterruptedException {
    if (col != null) {
        // output written to baseDir/colName/colName.pci-r-00000 (etc)
        String partitionFileName = col.getIdentity() + "/" + col.getName() + PARTITION_COL_INFO_FILE_POSTFIX;

        mos.write(BatchConstants.CFG_OUTPUT_PARTITION, NullWritable.get(), new LongWritable(timeMinValue),
                partitionFileName);
        mos.write(BatchConstants.CFG_OUTPUT_PARTITION, NullWritable.get(), new LongWritable(timeMaxValue),
                partitionFileName);
        logger.info("write partition info for col : " + col.getName() + " minValue:" + timeMinValue
                + " maxValue:" + timeMaxValue);
    }
}
From source file:org.apache.kylin.engine.mr.steps.FactDistinctColumnsReducer.java
License:Apache License
private void outputDict(TblColRef col, Dictionary<String> dict) throws IOException, InterruptedException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        mos.write(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(), new BytesWritable(baos.toByteArray()),
                dictFileName);
    }
}
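The reducer methods above all route records through MultipleOutputs with NullWritable.get() as the key, so each named output file contains only values. A minimal, hypothetical sketch of that wiring; the named output "column" and the base path "colName/" stand in for the Kylin constants and are assumptions, not the project's actual configuration:

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class ColumnValuesReducer extends Reducer<Text, Text, NullWritable, Text> {

    // In the driver: declare a named output whose key class is NullWritable.
    public static void configureOutputs(Job job) {
        MultipleOutputs.addNamedOutput(job, "column", SequenceFileOutputFormat.class,
                NullWritable.class, Text.class);
    }

    private MultipleOutputs<NullWritable, Text> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Only the value carries data; NullWritable.get() fills the key slot.
        mos.write("column", NullWritable.get(), key, "colName/");
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }
}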
From source file:org.apache.kylin.engine.mr.steps.UHCDictionaryMapper.java
License:Apache License
@Override
public void doMap(NullWritable key, Text value, Context context) throws IOException, InterruptedException {
    tmpBuf.clear();
    int size = value.getLength() + 1;
    if (size >= tmpBuf.capacity()) {
        tmpBuf = ByteBuffer.allocate(countNewSize(tmpBuf.capacity(), size));
    }
    tmpBuf.put(Bytes.toBytes(index)[3]);
    tmpBuf.put(value.getBytes(), 0, value.getLength());
    outputKey.set(tmpBuf.array(), 0, tmpBuf.position());

    sortableKey.init(outputKey, type);
    context.write(sortableKey, NullWritable.get());
}
From source file:org.apache.kylin.engine.mr.steps.UHCDictionaryReducer.java
License:Apache License
private void outputDict(TblColRef col, Dictionary<String> dict) throws IOException, InterruptedException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);
        mos.write(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(),
                new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
    }
    mos.close();
}
From source file:org.apache.kylin.storage.hbase.steps.CreateHTableJob.java
License:Apache License
protected static void saveHFileSplits(final List<HashMap<Long, Double>> innerRegionSplits, int mbPerRegion,
        final Path outputFolder, final KylinConfig kylinConfig) throws IOException {

    if (outputFolder == null) {
        logger.warn("outputFolder for hfile split file is null, skip inner region split");
        return;
    }

    // note read-write separation, respect HBase FS here
    Configuration hbaseConf = HBaseConnection.getCurrentHBaseConfiguration();
    FileSystem fs = FileSystem.get(hbaseConf);
    if (fs.exists(outputFolder) == false) {
        fs.mkdirs(outputFolder);
    }

    final float hfileSizeGB = kylinConfig.getHBaseHFileSizeGB();
    float hfileSizeMB = hfileSizeGB * 1024;
    if (hfileSizeMB > mbPerRegion) {
        hfileSizeMB = mbPerRegion;
    }

    // keep the tweak for sandbox test
    if (hfileSizeMB > 0.0 && kylinConfig.isDevEnv()) {
        hfileSizeMB = mbPerRegion / 2;
    }

    int compactionThreshold = Integer.valueOf(hbaseConf.get("hbase.hstore.compactionThreshold", "3"));
    logger.info("hbase.hstore.compactionThreshold is " + compactionThreshold);
    if (hfileSizeMB > 0.0 && hfileSizeMB * compactionThreshold < mbPerRegion) {
        hfileSizeMB = mbPerRegion / compactionThreshold;
    }

    if (hfileSizeMB <= 0) {
        hfileSizeMB = mbPerRegion;
    }
    logger.info("hfileSizeMB:" + hfileSizeMB);

    final Path hfilePartitionFile = new Path(outputFolder, "part-r-00000_hfile");
    short regionCount = (short) innerRegionSplits.size();

    List<byte[]> splits = Lists.newArrayList();
    for (int i = 0; i < regionCount; i++) {
        if (i > 0) {
            // skip 0
            byte[] split = new byte[RowConstants.ROWKEY_SHARDID_LEN];
            BytesUtil.writeUnsigned(i, split, 0, RowConstants.ROWKEY_SHARDID_LEN);
            splits.add(split); // split by region;
        }

        HashMap<Long, Double> cuboidSize = innerRegionSplits.get(i);
        List<Long> allCuboids = Lists.newArrayList();
        allCuboids.addAll(cuboidSize.keySet());
        Collections.sort(allCuboids);

        double accumulatedSize = 0;
        int j = 0;
        for (Long cuboid : allCuboids) {
            if (accumulatedSize >= hfileSizeMB) {
                logger.info(String.format("Region %d's hfile %d size is %.2f mb", i, j, accumulatedSize));
                byte[] split = new byte[RowConstants.ROWKEY_SHARD_AND_CUBOID_LEN];
                BytesUtil.writeUnsigned(i, split, 0, RowConstants.ROWKEY_SHARDID_LEN);
                System.arraycopy(Bytes.toBytes(cuboid), 0, split, RowConstants.ROWKEY_SHARDID_LEN,
                        RowConstants.ROWKEY_CUBOIDID_LEN);
                splits.add(split);
                accumulatedSize = 0;
                j++;
            }
            accumulatedSize += cuboidSize.get(cuboid);
        }
    }

    SequenceFile.Writer hfilePartitionWriter = SequenceFile.createWriter(hbaseConf,
            SequenceFile.Writer.file(hfilePartitionFile),
            SequenceFile.Writer.keyClass(ImmutableBytesWritable.class),
            SequenceFile.Writer.valueClass(NullWritable.class));

    for (int i = 0; i < splits.size(); i++) {
        hfilePartitionWriter.append(new ImmutableBytesWritable(splits.get(i)), NullWritable.get());
    }
    hfilePartitionWriter.close();
}
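The split file written above stores data only in its keys; NullWritable serves as the value class, so each record is just a split key. A simplified, self-contained sketch of that write-then-read pattern, using Text keys in place of HBase's ImmutableBytesWritable (the file path and keys are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SplitFileSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/part-r-00000_hfile");

        // Write: only the keys matter, so NullWritable.get() stands in for every value.
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(file),
                SequenceFile.Writer.keyClass(Text.class),
                SequenceFile.Writer.valueClass(NullWritable.class))) {
            writer.append(new Text("split-key-1"), NullWritable.get());
            writer.append(new Text("split-key-2"), NullWritable.get());
        }

        // Read: pass the same singleton back as the value placeholder.
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                SequenceFile.Reader.file(file))) {
            Text key = new Text();
            while (reader.next(key, NullWritable.get())) {
                System.out.println("split key: " + key);
            }
        }
    }
}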
From source file:org.apache.kylin.storage.hbase.steps.RangeKeyDistributionReducer.java
License:Apache License
@Override
protected void doCleanup(Context context) throws IOException, InterruptedException {
    int nRegion = Math.round((float) gbPoints.size() / cut);
    nRegion = Math.max(minRegionCount, nRegion);
    nRegion = Math.min(maxRegionCount, nRegion);

    int gbPerRegion = gbPoints.size() / nRegion;
    gbPerRegion = Math.max(1, gbPerRegion);

    if (hfileSizeGB <= 0) {
        hfileSizeGB = gbPerRegion;
    }
    int hfilePerRegion = (int) (gbPerRegion / hfileSizeGB);
    hfilePerRegion = Math.max(1, hfilePerRegion);

    System.out.println(nRegion + " regions");
    System.out.println(gbPerRegion + " GB per region");
    System.out.println(hfilePerRegion + " hfile per region");

    Path hfilePartitionFile = new Path(output + "/part-r-00000_hfile");
    SequenceFile.Writer hfilePartitionWriter = new SequenceFile.Writer(
            hfilePartitionFile.getFileSystem(context.getConfiguration()), context.getConfiguration(),
            hfilePartitionFile, ImmutableBytesWritable.class, NullWritable.class);
    int hfileCountInOneRegion = 0;
    for (int i = hfileSizeGB; i < gbPoints.size(); i += hfileSizeGB) {
        hfilePartitionWriter.append(new ImmutableBytesWritable(gbPoints.get(i).getBytes()),
                NullWritable.get());
        if (++hfileCountInOneRegion >= hfilePerRegion) {
            Text key = gbPoints.get(i);
            outputValue.set(i);
            System.out.println(StringUtils.byteToHexString(key.getBytes()) + "\t" + outputValue.get());
            context.write(key, outputValue);

            hfileCountInOneRegion = 0;
        }
    }
    hfilePartitionWriter.close();
}
From source file:org.apache.kylin.storage.hbase.steps.SparkCubeHFile.java
License:Apache License
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    final Path partitionFilePath = new Path(optionsHelper.getOptionValue(OPTION_PARTITION_FILE_PATH));
    final String hbaseConfFile = optionsHelper.getOptionValue(AbstractHadoopJob.OPTION_HBASE_CONF_PATH);
    final String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
            KeyValueCreator.class, KeyValue.class, RowKeyWritable.class };

    SparkConf conf = new SparkConf().setAppName("Converting HFile for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);

        final FileSystem fs = partitionFilePath.getFileSystem(sc.hadoopConfiguration());
        if (!fs.exists(partitionFilePath)) {
            throw new IllegalArgumentException("File not exist: " + partitionFilePath.toString());
        }

        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
        final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
        final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
        final CubeDesc cubeDesc = cubeInstance.getDescriptor();
        final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

        final MeasureCodec inputCodec = new MeasureCodec(cubeDesc.getMeasures());
        final List<KeyValueCreator> keyValueCreators = Lists.newArrayList();

        for (HBaseColumnFamilyDesc cfDesc : cubeDesc.getHbaseMapping().getColumnFamily()) {
            for (HBaseColumnDesc colDesc : cfDesc.getColumns()) {
                keyValueCreators.add(new KeyValueCreator(cubeDesc, colDesc));
            }
        }

        final int cfNum = keyValueCreators.size();
        final boolean quickPath = (keyValueCreators.size() == 1) && keyValueCreators.get(0).isFullCopy;

        logger.info("Input path: {}", inputPath);
        logger.info("Output path: {}", outputPath);

        // read partition split keys
        List<RowKeyWritable> keys = new ArrayList<>();
        try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, partitionFilePath,
                sc.hadoopConfiguration())) {
            RowKeyWritable key = new RowKeyWritable();
            Writable value = NullWritable.get();
            while (reader.next(key, value)) {
                keys.add(key);
                logger.info(" ------- split key: {}", key);
                key = new RowKeyWritable(); // important: allocate a new object for the next record
            }
        }

        logger.info("There are {} split keys, totally {} hfiles", keys.size(), (keys.size() + 1));

        //HBase conf
        logger.info("Loading HBase configuration from:{}", hbaseConfFile);
        final Path hbaseConfFilePath = new Path(hbaseConfFile);
        final FileSystem hbaseClusterFs = hbaseConfFilePath.getFileSystem(sc.hadoopConfiguration());

        try (FSDataInputStream confInput = hbaseClusterFs.open(new Path(hbaseConfFile))) {
            Configuration hbaseJobConf = new Configuration();
            hbaseJobConf.addResource(confInput);
            hbaseJobConf.set("spark.hadoop.dfs.replication", "3"); // HFile, replication=3
            Job job = Job.getInstance(hbaseJobConf, cubeSegment.getStorageLocationIdentifier());

            FileOutputFormat.setOutputPath(job, new Path(outputPath));

            // inputPath has the same FileSystem as hbaseClusterFs when in HBase standalone mode
            JavaPairRDD<Text, Text> inputRDDs = SparkUtil.parseInputPath(inputPath, hbaseClusterFs, sc,
                    Text.class, Text.class);

            final JavaPairRDD<RowKeyWritable, KeyValue> hfilerdd;
            if (quickPath) {
                hfilerdd = inputRDDs.mapToPair(new PairFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
                    @Override
                    public Tuple2<RowKeyWritable, KeyValue> call(Tuple2<Text, Text> textTextTuple2)
                            throws Exception {
                        KeyValue outputValue = keyValueCreators.get(0).create(textTextTuple2._1,
                                textTextTuple2._2.getBytes(), 0, textTextTuple2._2.getLength());
                        return new Tuple2<>(new RowKeyWritable(outputValue.createKeyOnly(false).getKey()),
                                outputValue);
                    }
                });
            } else {
                hfilerdd = inputRDDs
                        .flatMapToPair(new PairFlatMapFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
                            @Override
                            public Iterator<Tuple2<RowKeyWritable, KeyValue>> call(
                                    Tuple2<Text, Text> textTextTuple2) throws Exception {
                                List<Tuple2<RowKeyWritable, KeyValue>> result = Lists
                                        .newArrayListWithExpectedSize(cfNum);
                                Object[] inputMeasures = new Object[cubeDesc.getMeasures().size()];
                                inputCodec.decode(ByteBuffer.wrap(textTextTuple2._2.getBytes(), 0,
                                        textTextTuple2._2.getLength()), inputMeasures);

                                for (int i = 0; i < cfNum; i++) {
                                    KeyValue outputValue = keyValueCreators.get(i).create(textTextTuple2._1,
                                            inputMeasures);
                                    result.add(new Tuple2<>(
                                            new RowKeyWritable(outputValue.createKeyOnly(false).getKey()),
                                            outputValue));
                                }

                                return result.iterator();
                            }
                        });
            }

            hfilerdd.repartitionAndSortWithinPartitions(new HFilePartitioner(keys),
                    RowKeyWritable.RowKeyComparator.INSTANCE)
                    .mapToPair(
                            new PairFunction<Tuple2<RowKeyWritable, KeyValue>, ImmutableBytesWritable, KeyValue>() {
                                @Override
                                public Tuple2<ImmutableBytesWritable, KeyValue> call(
                                        Tuple2<RowKeyWritable, KeyValue> rowKeyWritableKeyValueTuple2)
                                        throws Exception {
                                    return new Tuple2<>(
                                            new ImmutableBytesWritable(rowKeyWritableKeyValueTuple2._2.getKey()),
                                            rowKeyWritableKeyValueTuple2._2);
                                }
                            })
                    .saveAsNewAPIHadoopDataset(job.getConfiguration());
        }

        logger.info("HDFS: Number of bytes written={}", jobListener.metrics.getBytesWritten());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.HDFS_BYTES_WRITTEN,
                String.valueOf(jobListener.metrics.getBytesWritten()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
    }
}
From source file:org.apache.mahout.cf.taste.hadoop.pseudo.UserIDsMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String line = value.toString();
    int comma = line.indexOf(',');
    long userID = comma >= 0 ? Long.parseLong(line.substring(0, comma)) : Long.parseLong(line);
    context.write(new VarLongWritable(userID), NullWritable.get());
}
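The mapper above emits each user ID with NullWritable.get() as the value, so duplicate IDs collapse during the shuffle. A hypothetical companion reducer (not part of the Mahout source shown here) would then write each distinct ID exactly once:

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.math.VarLongWritable;

// Illustrative dedup reducer: all NullWritable values for one user ID arrive together,
// so emitting the key once per group yields the distinct set of user IDs.
public class UserIDsReducer extends Reducer<VarLongWritable, NullWritable, VarLongWritable, NullWritable> {
    @Override
    protected void reduce(VarLongWritable userID, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        context.write(userID, NullWritable.get());
    }
}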