Example usage for org.apache.hadoop.io NullWritable get

List of usage examples for org.apache.hadoop.io NullWritable get

Introduction

On this page you can find example usages of org.apache.hadoop.io.NullWritable.get().

Prototype

public static NullWritable get() 

Document

Returns the single instance of this class.
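
Because NullWritable carries no data, get() is the standard way to fill a key or value slot that exists only to satisfy the SequenceFile or MapReduce API. Below is a minimal, self-contained sketch (the class name and output path are illustrative, not taken from the examples) that writes a keys-only SequenceFile, which is essentially what several of the Kylin examples further down do:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class NullWritableGetExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/keys-only.seq"); // illustrative output path

        // Declare NullWritable as the value class; get() supplies the shared singleton,
        // so no value bytes are written for any record.
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(file),
                SequenceFile.Writer.keyClass(Text.class),
                SequenceFile.Writer.valueClass(NullWritable.class))) {
            writer.append(new Text("row-1"), NullWritable.get());
            writer.append(new Text("row-2"), NullWritable.get());
        }
    }
}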

Usage

From source file: org.apache.kylin.engine.mr.steps.CreateDictionaryJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OPTION_CUBE_NAME);
    options.addOption(OPTION_SEGMENT_ID);
    options.addOption(OPTION_INPUT_PATH);
    parseOptions(options, args);

    final String cubeName = getOptionValue(OPTION_CUBE_NAME);
    final String segmentID = getOptionValue(OPTION_SEGMENT_ID);
    final String factColumnsInputPath = getOptionValue(OPTION_INPUT_PATH);

    final KylinConfig config = KylinConfig.getInstanceFromEnv();

    DictionaryGeneratorCLI.processSegment(config, cubeName, segmentID, new DistinctColumnValuesProvider() {
        @Override
        public IReadableTable getDistinctValuesFor(TblColRef col) {
            return new SortedColumnDFSFile(factColumnsInputPath + "/" + col.getIdentity(), col.getType());
        }
    }, new DictionaryProvider() {

        @Override
        public Dictionary<String> getDictionary(TblColRef col) throws IOException {
            Path colDir = new Path(factColumnsInputPath, col.getIdentity());
            FileSystem fs = HadoopUtil.getWorkingFileSystem();

            Path dictFile = HadoopUtil.getFilterOnlyPath(fs, colDir,
                    col.getName() + FactDistinctColumnsReducer.DICT_FILE_POSTFIX);
            if (dictFile == null) {
                logger.info("Dict for '" + col.getName() + "' not pre-built.");
                return null;
            }

            try (SequenceFile.Reader reader = new SequenceFile.Reader(HadoopUtil.getCurrentConfiguration(),
                    SequenceFile.Reader.file(dictFile))) {
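                // the dict file is written with a NullWritable key, so the shared singleton is passed as a placeholder when reading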
                NullWritable key = NullWritable.get();
                BytesWritable value = new BytesWritable();
                reader.next(key, value);

                ByteBuffer buffer = new ByteArray(value.getBytes()).asBuffer();
                try (DataInputStream is = new DataInputStream(new ByteBufferBackedInputStream(buffer))) {
                    String dictClassName = is.readUTF();
                    Dictionary<String> dict = (Dictionary<String>) ClassUtil.newInstance(dictClassName);
                    dict.readFields(is);
                    logger.info("DictionaryProvider read dict from file: " + dictFile);
                    return dict;
                }
            }
        }
    });

    return 0;
}

From source file: org.apache.kylin.engine.mr.steps.FactDistinctColumnsReducer.java

License: Apache License

@Override
public void doReduce(SelfDefineSortableKey skey, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    Text key = skey.getText();
    if (isStatistics) {
        // for hll
        long cuboidId = Bytes.toLong(key.getBytes(), 1, Bytes.SIZEOF_LONG);
        for (Text value : values) {
            HLLCounter hll = new HLLCounter(cubeConfig.getCubeStatsHLLPrecision());
            ByteBuffer bf = ByteBuffer.wrap(value.getBytes(), 0, value.getLength());
            hll.readRegisters(bf);

            totalRowsBeforeMerge += hll.getCountEstimate();

            if (cuboidId == baseCuboidId) {
                baseCuboidRowCountInMappers.add(hll.getCountEstimate());
            }

            if (cuboidHLLMap.get(cuboidId) != null) {
                cuboidHLLMap.get(cuboidId).merge(hll);
            } else {
                cuboidHLLMap.put(cuboidId, hll);
            }
        }
    } else if (isPartitionCol) {
        // partition col
        String value = Bytes.toString(key.getBytes(), 1, key.getLength() - 1);
        logAFewRows(value);
        long time = DateFormat.stringToMillis(value);
        timeMinValue = Math.min(timeMinValue, time);
        timeMaxValue = Math.max(timeMaxValue, time);
    } else {
        // normal col
        if (buildDictInReducer) {
            String value = Bytes.toString(key.getBytes(), 1, key.getLength() - 1);
            logAFewRows(value);
            builder.addValue(value);
        } else {
            byte[] keyBytes = Bytes.copy(key.getBytes(), 1, key.getLength() - 1);
            // output written to baseDir/colName/-r-00000 (etc)
            String fileName = col.getIdentity() + "/";
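            // only the distinct column value matters; NullWritable.get() fills the key slot and writes no bytes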
            mos.write(BatchConstants.CFG_OUTPUT_COLUMN, NullWritable.get(), new Text(keyBytes), fileName);
        }
    }

    rowCount++;
}

From source file: org.apache.kylin.engine.mr.steps.FactDistinctColumnsReducer.java

License: Apache License

private void outputPartitionInfo() throws IOException, InterruptedException {
    if (col != null) {
        // output written to baseDir/colName/colName.pci-r-00000 (etc)
        String partitionFileName = col.getIdentity() + "/" + col.getName() + PARTITION_COL_INFO_FILE_POSTFIX;

        mos.write(BatchConstants.CFG_OUTPUT_PARTITION, NullWritable.get(), new LongWritable(timeMinValue),
                partitionFileName);
        mos.write(BatchConstants.CFG_OUTPUT_PARTITION, NullWritable.get(), new LongWritable(timeMaxValue),
                partitionFileName);
        logger.info("write partition info for col : " + col.getName() + "  minValue:" + timeMinValue
                + " maxValue:" + timeMaxValue);
    }
}

From source file: org.apache.kylin.engine.mr.steps.FactDistinctColumnsReducer.java

License: Apache License

private void outputDict(TblColRef col, Dictionary<String> dict) throws IOException, InterruptedException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos);) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        mos.write(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(), new BytesWritable(baos.toByteArray()),
                dictFileName);
    }
}

From source file: org.apache.kylin.engine.mr.steps.UHCDictionaryMapper.java

License: Apache License

@Override
public void doMap(NullWritable key, Text value, Context context) throws IOException, InterruptedException {
    tmpBuf.clear();
    int size = value.getLength() + 1;
    if (size >= tmpBuf.capacity()) {
        tmpBuf = ByteBuffer.allocate(countNewSize(tmpBuf.capacity(), size));
    }
    tmpBuf.put(Bytes.toBytes(index)[3]);
    tmpBuf.put(value.getBytes(), 0, value.getLength());
    outputKey.set(tmpBuf.array(), 0, tmpBuf.position());

    sortableKey.init(outputKey, type);
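    // all information is carried by the sortable key; NullWritable.get() is emitted as an empty value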
    context.write(sortableKey, NullWritable.get());
}

From source file: org.apache.kylin.engine.mr.steps.UHCDictionaryReducer.java

License: Apache License

private void outputDict(TblColRef col, Dictionary<String> dict) throws IOException, InterruptedException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos);) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        mos.write(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(),
                new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
    }
    mos.close();
}

From source file: org.apache.kylin.storage.hbase.steps.CreateHTableJob.java

License: Apache License

protected static void saveHFileSplits(final List<HashMap<Long, Double>> innerRegionSplits, int mbPerRegion,
        final Path outputFolder, final KylinConfig kylinConfig) throws IOException {

    if (outputFolder == null) {
        logger.warn("outputFolder for hfile split file is null, skip inner region split");
        return;
    }

    // note read-write separation, respect HBase FS here
    Configuration hbaseConf = HBaseConnection.getCurrentHBaseConfiguration();
    FileSystem fs = FileSystem.get(hbaseConf);
    if (!fs.exists(outputFolder)) {
        fs.mkdirs(outputFolder);
    }

    final float hfileSizeGB = kylinConfig.getHBaseHFileSizeGB();
    float hfileSizeMB = hfileSizeGB * 1024;
    if (hfileSizeMB > mbPerRegion) {
        hfileSizeMB = mbPerRegion;
    }

    // keep the tweak for sandbox test
    if (hfileSizeMB > 0.0 && kylinConfig.isDevEnv()) {
        hfileSizeMB = mbPerRegion / 2;
    }

    int compactionThreshold = Integer.valueOf(hbaseConf.get("hbase.hstore.compactionThreshold", "3"));
    logger.info("hbase.hstore.compactionThreshold is " + compactionThreshold);
    if (hfileSizeMB > 0.0 && hfileSizeMB * compactionThreshold < mbPerRegion) {
        hfileSizeMB = mbPerRegion / compactionThreshold;
    }

    if (hfileSizeMB <= 0) {
        hfileSizeMB = mbPerRegion;
    }
    logger.info("hfileSizeMB:" + hfileSizeMB);
    final Path hfilePartitionFile = new Path(outputFolder, "part-r-00000_hfile");
    short regionCount = (short) innerRegionSplits.size();

    List<byte[]> splits = Lists.newArrayList();
    for (int i = 0; i < regionCount; i++) {
        if (i > 0) {
            // skip 0
            byte[] split = new byte[RowConstants.ROWKEY_SHARDID_LEN];
            BytesUtil.writeUnsigned(i, split, 0, RowConstants.ROWKEY_SHARDID_LEN);
            splits.add(split); // split by region;
        }

        HashMap<Long, Double> cuboidSize = innerRegionSplits.get(i);
        List<Long> allCuboids = Lists.newArrayList();
        allCuboids.addAll(cuboidSize.keySet());
        Collections.sort(allCuboids);

        double accumulatedSize = 0;
        int j = 0;
        for (Long cuboid : allCuboids) {
            if (accumulatedSize >= hfileSizeMB) {
                logger.info(String.format("Region %d's hfile %d size is %.2f mb", i, j, accumulatedSize));
                byte[] split = new byte[RowConstants.ROWKEY_SHARD_AND_CUBOID_LEN];
                BytesUtil.writeUnsigned(i, split, 0, RowConstants.ROWKEY_SHARDID_LEN);
                System.arraycopy(Bytes.toBytes(cuboid), 0, split, RowConstants.ROWKEY_SHARDID_LEN,
                        RowConstants.ROWKEY_CUBOIDID_LEN);
                splits.add(split);
                accumulatedSize = 0;
                j++;
            }
            accumulatedSize += cuboidSize.get(cuboid);
        }

    }

    SequenceFile.Writer hfilePartitionWriter = SequenceFile.createWriter(hbaseConf,
            SequenceFile.Writer.file(hfilePartitionFile),
            SequenceFile.Writer.keyClass(ImmutableBytesWritable.class),
            SequenceFile.Writer.valueClass(NullWritable.class));

    for (int i = 0; i < splits.size(); i++) {
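        // only the split keys are stored; NullWritable.get() fills the value slot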
        hfilePartitionWriter.append(new ImmutableBytesWritable(splits.get(i)), NullWritable.get());
    }
    hfilePartitionWriter.close();
}

From source file: org.apache.kylin.storage.hbase.steps.RangeKeyDistributionReducer.java

License: Apache License

@Override
protected void doCleanup(Context context) throws IOException, InterruptedException {
    int nRegion = Math.round((float) gbPoints.size() / cut);
    nRegion = Math.max(minRegionCount, nRegion);
    nRegion = Math.min(maxRegionCount, nRegion);

    int gbPerRegion = gbPoints.size() / nRegion;
    gbPerRegion = Math.max(1, gbPerRegion);

    if (hfileSizeGB <= 0) {
        hfileSizeGB = gbPerRegion;
    }
    int hfilePerRegion = (int) (gbPerRegion / hfileSizeGB);
    hfilePerRegion = Math.max(1, hfilePerRegion);

    System.out.println(nRegion + " regions");
    System.out.println(gbPerRegion + " GB per region");
    System.out.println(hfilePerRegion + " hfile per region");

    Path hfilePartitionFile = new Path(output + "/part-r-00000_hfile");
    SequenceFile.Writer hfilePartitionWriter = new SequenceFile.Writer(
            hfilePartitionFile.getFileSystem(context.getConfiguration()), context.getConfiguration(),
            hfilePartitionFile, ImmutableBytesWritable.class, NullWritable.class);
    int hfileCountInOneRegion = 0;
    for (int i = hfileSizeGB; i < gbPoints.size(); i += hfileSizeGB) {
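        // write each hfile split point as the key, with NullWritable.get() as the empty value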
        hfilePartitionWriter.append(new ImmutableBytesWritable(gbPoints.get(i).getBytes()), NullWritable.get());
        if (++hfileCountInOneRegion >= hfilePerRegion) {
            Text key = gbPoints.get(i);
            outputValue.set(i);
            System.out.println(StringUtils.byteToHexString(key.getBytes()) + "\t" + outputValue.get());
            context.write(key, outputValue);

            hfileCountInOneRegion = 0;
        }
    }
    hfilePartitionWriter.close();
}

From source file: org.apache.kylin.storage.hbase.steps.SparkCubeHFile.java

License: Apache License

@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    final Path partitionFilePath = new Path(optionsHelper.getOptionValue(OPTION_PARTITION_FILE_PATH));
    final String hbaseConfFile = optionsHelper.getOptionValue(AbstractHadoopJob.OPTION_HBASE_CONF_PATH);
    final String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
            KeyValueCreator.class, KeyValue.class, RowKeyWritable.class };

    SparkConf conf = new SparkConf().setAppName("Converting HFile for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        final FileSystem fs = partitionFilePath.getFileSystem(sc.hadoopConfiguration());
        if (!fs.exists(partitionFilePath)) {
            throw new IllegalArgumentException("File not exist: " + partitionFilePath.toString());
        }

        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
        final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());

        final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
        final CubeDesc cubeDesc = cubeInstance.getDescriptor();
        final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

        final MeasureCodec inputCodec = new MeasureCodec(cubeDesc.getMeasures());
        final List<KeyValueCreator> keyValueCreators = Lists.newArrayList();

        for (HBaseColumnFamilyDesc cfDesc : cubeDesc.getHbaseMapping().getColumnFamily()) {
            for (HBaseColumnDesc colDesc : cfDesc.getColumns()) {
                keyValueCreators.add(new KeyValueCreator(cubeDesc, colDesc));
            }
        }

        final int cfNum = keyValueCreators.size();
        final boolean quickPath = (keyValueCreators.size() == 1) && keyValueCreators.get(0).isFullCopy;

        logger.info("Input path: {}", inputPath);
        logger.info("Output path: {}", outputPath);
        // read partition split keys
        List<RowKeyWritable> keys = new ArrayList<>();
        try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, partitionFilePath,
                sc.hadoopConfiguration())) {
            RowKeyWritable key = new RowKeyWritable();
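            // the partition file holds keys only, so the NullWritable singleton is reused as the value for every record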
            Writable value = NullWritable.get();
            while (reader.next(key, value)) {
                keys.add(key);
                logger.info(" ------- split key: {}", key);
                key = new RowKeyWritable(); // important, new an object!
            }
        }

        logger.info("There are {} split keys, totally {} hfiles", keys.size(), (keys.size() + 1));

        //HBase conf
        logger.info("Loading HBase configuration from:{}", hbaseConfFile);
        final Path hbaseConfFilePath = new Path(hbaseConfFile);
        final FileSystem hbaseClusterFs = hbaseConfFilePath.getFileSystem(sc.hadoopConfiguration());

        try (FSDataInputStream confInput = hbaseClusterFs.open(new Path(hbaseConfFile))) {
            Configuration hbaseJobConf = new Configuration();
            hbaseJobConf.addResource(confInput);
            hbaseJobConf.set("spark.hadoop.dfs.replication", "3"); // HFile, replication=3
            Job job = Job.getInstance(hbaseJobConf, cubeSegment.getStorageLocationIdentifier());

            FileOutputFormat.setOutputPath(job, new Path(outputPath));

            // inputPath has the same FileSystem as hbaseClusterFs when in HBase standalone mode
            JavaPairRDD<Text, Text> inputRDDs = SparkUtil.parseInputPath(inputPath, hbaseClusterFs, sc,
                    Text.class, Text.class);
            final JavaPairRDD<RowKeyWritable, KeyValue> hfilerdd;
            if (quickPath) {
                hfilerdd = inputRDDs
                        .mapToPair(new PairFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
                            @Override
                            public Tuple2<RowKeyWritable, KeyValue> call(Tuple2<Text, Text> textTextTuple2)
                                    throws Exception {
                                KeyValue outputValue = keyValueCreators.get(0).create(textTextTuple2._1,
                                        textTextTuple2._2.getBytes(), 0, textTextTuple2._2.getLength());
                                return new Tuple2<>(
                                        new RowKeyWritable(outputValue.createKeyOnly(false).getKey()),
                                        outputValue);
                            }
                        });
            } else {
                hfilerdd = inputRDDs
                        .flatMapToPair(new PairFlatMapFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
                            @Override
                            public Iterator<Tuple2<RowKeyWritable, KeyValue>> call(
                                    Tuple2<Text, Text> textTextTuple2) throws Exception {

                                List<Tuple2<RowKeyWritable, KeyValue>> result = Lists
                                        .newArrayListWithExpectedSize(cfNum);
                                Object[] inputMeasures = new Object[cubeDesc.getMeasures().size()];
                                inputCodec.decode(ByteBuffer.wrap(textTextTuple2._2.getBytes(), 0,
                                        textTextTuple2._2.getLength()), inputMeasures);

                                for (int i = 0; i < cfNum; i++) {
                                    KeyValue outputValue = keyValueCreators.get(i).create(textTextTuple2._1,
                                            inputMeasures);
                                    result.add(new Tuple2<>(
                                            new RowKeyWritable(outputValue.createKeyOnly(false).getKey()),
                                            outputValue));
                                }

                                return result.iterator();
                            }
                        });
            }

            hfilerdd.repartitionAndSortWithinPartitions(new HFilePartitioner(keys),
                    RowKeyWritable.RowKeyComparator.INSTANCE)
                    .mapToPair(
                            new PairFunction<Tuple2<RowKeyWritable, KeyValue>, ImmutableBytesWritable, KeyValue>() {
                                @Override
                                public Tuple2<ImmutableBytesWritable, KeyValue> call(
                                        Tuple2<RowKeyWritable, KeyValue> rowKeyWritableKeyValueTuple2)
                                        throws Exception {
                                    return new Tuple2<>(
                                            new ImmutableBytesWritable(
                                                    rowKeyWritableKeyValueTuple2._2.getKey()),
                                            rowKeyWritableKeyValueTuple2._2);
                                }
                            })
                    .saveAsNewAPIHadoopDataset(job.getConfiguration());
        }

        logger.info("HDFS: Number of bytes written={}", jobListener.metrics.getBytesWritten());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.HDFS_BYTES_WRITTEN,
                String.valueOf(jobListener.metrics.getBytesWritten()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
    }
}

From source file: org.apache.mahout.cf.taste.hadoop.pseudo.UserIDsMapper.java

License: Apache License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String line = value.toString();
    int comma = line.indexOf(',');
    long userID = comma >= 0 ? Long.parseLong(line.substring(0, comma)) : Long.parseLong(line);
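    // emit the user ID as the key; NullWritable.get() signals that no value payload is needed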
    context.write(new VarLongWritable(userID), NullWritable.get());
}