List of usage examples for org.apache.hadoop.io.NullWritable.get()

public static NullWritable get()
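NullWritable.get() returns the singleton NullWritable instance; the type serializes to zero bytes, so it is the idiomatic placeholder wherever a MapReduce key or value carries no information, as in all of the examples below. A minimal sketch of the pattern (the PassThroughMapper class and its generics are illustrative, not taken from any of the source files below):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper: emits each input line as the key and attaches
// NullWritable.get() as the value, since no per-line payload is needed.
public class PassThroughMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // NullWritable.get() always returns the same shared instance;
        // it contributes zero bytes to the serialized output record.
        context.write(value, NullWritable.get());
    }
}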
From source file:ml.shifu.shifu.core.binning.UpdateBinningInfoReducer.java
License:Apache License
@Override
protected void reduce(IntWritable key, Iterable<BinningInfoWritable> values, Context context)
        throws IOException, InterruptedException {
    long start = System.currentTimeMillis();
    double sum = 0d;
    double squaredSum = 0d;
    double tripleSum = 0d;
    double quarticSum = 0d;
    long count = 0L, missingCount = 0L;
    double min = Double.MAX_VALUE, max = Double.MIN_VALUE;
    List<Double> binBoundaryList = null;
    List<String> binCategories = null;
    long[] binCountPos = null;
    long[] binCountNeg = null;
    double[] binWeightPos = null;
    double[] binWeightNeg = null;
    ColumnConfig columnConfig = this.columnConfigList.get(key.get());
    int binSize = 0;

    // Merge the partial binning info emitted by the mappers for this column.
    for (BinningInfoWritable info : values) {
        if (info.isNumeric() && binBoundaryList == null) {
            binBoundaryList = info.getBinBoundaries();
            binSize = binBoundaryList.size();
            binCountPos = new long[binSize + 1];
            binCountNeg = new long[binSize + 1];
            binWeightPos = new double[binSize + 1];
            binWeightNeg = new double[binSize + 1];
        }
        if (!info.isNumeric() && binCategories == null) {
            binCategories = info.getBinCategories();
            binSize = binCategories.size();
            binCountPos = new long[binSize + 1];
            binCountNeg = new long[binSize + 1];
            binWeightPos = new double[binSize + 1];
            binWeightNeg = new double[binSize + 1];
        }
        count += info.getTotalCount();
        missingCount += info.getMissingCount();
        // For numeric columns such sums are OK; for categorical columns these values are all 0 and are
        // recomputed below from binCountPos and binCountNeg.
        sum += info.getSum();
        squaredSum += info.getSquaredSum();
        tripleSum += info.getTripleSum();
        quarticSum += info.getQuarticSum();
        if (Double.compare(max, info.getMax()) < 0) {
            max = info.getMax();
        }
        if (Double.compare(min, info.getMin()) > 0) {
            min = info.getMin();
        }
        for (int i = 0; i < (binSize + 1); i++) {
            binCountPos[i] += info.getBinCountPos()[i];
            binCountNeg[i] += info.getBinCountNeg()[i];
            binWeightPos[i] += info.getBinWeightPos()[i];
            binWeightNeg[i] += info.getBinWeightNeg()[i];
        }
    }

    double[] binPosRate = computePosRate(binCountPos, binCountNeg);

    String binBounString = null;
    if (columnConfig.isCategorical()) {
        if (binCategories.size() == 0 || binCategories.size() > MAX_CATEGORICAL_BINC_COUNT) {
            LOG.warn("Column {} {} with invalid bin boundary size {}.", key.get(),
                    columnConfig.getColumnName(), binCategories.size());
            return;
        }
        binBounString = Base64Utils.base64Encode(
                "[" + StringUtils.join(binCategories, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR) + "]");
        // Recompute min, max and the moment sums for categorical variables from the bin positive rates.
        min = Double.MAX_VALUE;
        max = Double.MIN_VALUE;
        sum = 0d;
        squaredSum = 0d;
        for (int i = 0; i < binPosRate.length; i++) {
            if (!Double.isNaN(binPosRate[i])) {
                if (Double.compare(max, binPosRate[i]) < 0) {
                    max = binPosRate[i];
                }
                if (Double.compare(min, binPosRate[i]) > 0) {
                    min = binPosRate[i];
                }
                long binCount = binCountPos[i] + binCountNeg[i];
                sum += binPosRate[i] * binCount;
                double squaredVal = binPosRate[i] * binPosRate[i];
                squaredSum += squaredVal * binCount;
                tripleSum += squaredVal * binPosRate[i] * binCount;
                quarticSum += squaredVal * squaredVal * binCount;
            }
        }
    } else {
        if (binBoundaryList.size() == 0) {
            LOG.warn("Column {} {} with invalid bin boundary size {}.", key.get(),
                    columnConfig.getColumnName(), binBoundaryList.size());
            return;
        }
        binBounString = binBoundaryList.toString();
    }

    ColumnMetrics columnCountMetrics = ColumnStatsCalculator.calculateColumnMetrics(binCountNeg, binCountPos);
    ColumnMetrics columnWeightMetrics = ColumnStatsCalculator.calculateColumnMetrics(binWeightNeg,
            binWeightPos);

    // TODO & FIXME do we need validCount (totalCount - missingValueCount) for mean and stddev???
    double mean = sum / count;
    double stdDev = Math.sqrt(Math.abs((squaredSum - (sum * sum) / count + EPS) / (count - 1)));
    double aStdDev = Math.sqrt(Math.abs((squaredSum - (sum * sum) / count + EPS) / count));
    double skewness = ColumnStatsCalculator.computeSkewness(count, mean, aStdDev, sum, squaredSum, tripleSum);
    double kurtosis = ColumnStatsCalculator.computeKurtosis(count, mean, aStdDev, sum, squaredSum, tripleSum,
            quarticSum);

    // Build one delimited stats line for this column.
    sb.append(key.get()).append(Constants.DEFAULT_DELIMITER).append(binBounString)
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(binCountNeg))
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(binCountPos))
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(new double[0]))
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(binPosRate))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(columnCountMetrics.getKs()))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(columnWeightMetrics.getIv()))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(max)).append(Constants.DEFAULT_DELIMITER)
            .append(df.format(min)).append(Constants.DEFAULT_DELIMITER).append(df.format(mean))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(stdDev)).append(Constants.DEFAULT_DELIMITER)
            .append(columnConfig.isCategorical() ? "C" : "N").append(Constants.DEFAULT_DELIMITER)
            .append(df.format(mean)).append(Constants.DEFAULT_DELIMITER).append(missingCount)
            .append(Constants.DEFAULT_DELIMITER).append(count).append(Constants.DEFAULT_DELIMITER)
            .append(missingCount * 1.0d / count).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binWeightNeg)).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binWeightPos)).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics.getWoe()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics.getWoe()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics.getKs()).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics.getIv()).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics.getBinningWoe().toString()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics.getBinningWoe().toString()).append(Constants.DEFAULT_DELIMITER)
            .append(skewness).append(Constants.DEFAULT_DELIMITER).append(kurtosis);

    outputValue.set(sb.toString());
    // NullWritable.get() as the key: only the stats line itself is written to the output.
    context.write(NullWritable.get(), outputValue);
    sb.delete(0, sb.length());
    LOG.debug("Time:{}", (System.currentTimeMillis() - start));
}
From source file:ml.shifu.shifu.core.posttrain.PostTrainMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String valueStr = value.toString();
    // StringUtils.isBlank is not used here to avoid importing a new jar
    if (valueStr == null || valueStr.length() == 0 || valueStr.trim().length() == 0) {
        LOG.warn("Empty input.");
        return;
    }

    if (!this.dataPurifier.isFilter(valueStr)) {
        return;
    }

    String[] units = CommonUtils.split(valueStr, this.modelConfig.getDataSetDelimiter());
    // tagColumnNum must be a valid index into units, otherwise an ArrayIndexOutOfBoundsException is thrown
    String tag = CommonUtils.trimTag(units[this.tagColumnNum]);

    if (!this.tags.contains(tag)) {
        if (System.currentTimeMillis() % 20 == 0) {
            LOG.warn("Data with invalid tag is ignored in post train, invalid tag: {}.", tag);
        }
        context.getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1L);
        return;
    }

    Map<String, String> rawDataMap = buildRawDataMap(units);
    CaseScoreResult csr = this.modelRunner.compute(rawDataMap);

    // Store the score values as one delimited line.
    StringBuilder sb = new StringBuilder(500);
    sb.append(csr.getAvgScore()).append(Constants.DEFAULT_DELIMITER).append(csr.getMaxScore())
            .append(Constants.DEFAULT_DELIMITER).append(csr.getMinScore()).append(Constants.DEFAULT_DELIMITER);
    for (Double score : csr.getScores()) {
        sb.append(score).append(Constants.DEFAULT_DELIMITER);
    }
    List<String> metaList = modelConfig.getMetaColumnNames();
    for (String meta : metaList) {
        sb.append(rawDataMap.get(meta)).append(Constants.DEFAULT_DELIMITER);
    }
    sb.deleteCharAt(sb.length() - Constants.DEFAULT_DELIMITER.length());

    this.outputValue.set(sb.toString());
    // NullWritable.get() as the key: only the score line itself goes to the named output.
    this.mos.write(Constants.POST_TRAIN_OUTPUT_SCORE, NullWritable.get(), this.outputValue);

    for (int i = 0; i < headers.length; i++) {
        ColumnConfig config = this.columnConfigList.get(i);
        if (!config.isMeta() && !config.isTarget() && config.isFinalSelect()) {
            int binNum = BinUtils.getBinNum(config, units[i]);
            List<BinStats> featureStatistics = this.variableStatsMap.get(config.getColumnNum());
            BinStats bs = null;
            if (binNum == -1) {
                // -1 means an invalid numeric value such as null or empty; the last bin holds the empty stats.
                bs = featureStatistics.get(featureStatistics.size() - 1);
            } else {
                bs = featureStatistics.get(binNum);
            }
            // bs should not be null as it is already initialized in setup
            bs.setBinSum(csr.getAvgScore() + bs.getBinSum());
            bs.setBinCnt(1L + bs.getBinCnt());
        }
    }
}
From source file:net.jarcec.sqoop.data.gen.mr.GeneratorMapper.java
License:Apache License
@Override
protected void map(LongWritable key, LongWritable value, Context context)
        throws IOException, InterruptedException {
    long from = key.get();
    long to = value.get();

    random = new SecureRandom();
    decimal = new DecimalFormat("###.###");
    date = new SimpleDateFormat("yyyy-MM-dd");
    time = new SimpleDateFormat("HH:mm:ss");
    datetime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    String[] types = context.getConfiguration().get(Constants.TYPES).split(",");
    String[] values = new String[types.length];

    for (long i = from; i < to; i++) {
        context.progress();

        int y = 0;
        for (String type : types) {
            if ("id".equals(type)) {
                values[y] = String.valueOf(i);
            } else if ("s50".equals(type)) {
                values[y] = generateString(50);
            } else if ("i".equals(type)) {
                values[y] = generateInteger();
            } else if ("f".equals(type)) {
                values[y] = generateFloat(250, 31);
            } else if ("d".equals(type)) {
                values[y] = generateDate();
            } else if ("t".equals(type)) {
                values[y] = generateTime();
            } else if ("dt".equals(type)) {
                values[y] = generateDateTime();
            } else if ("s255".equals(type)) {
                values[y] = generateString(255);
            } else {
                throw new RuntimeException("Unknown type: " + type);
            }
            y++;
        }

        // NullWritable.get() as the value: the generated CSV line is the entire record.
        context.write(new Text(StringUtils.join(values, ",")), NullWritable.get());
    }
}
From source file:nl.bioinf.wvanhelvoirt.HadoopPhredCalculator.CombineReducer.java
License:Open Source License
/**
 * Override method that processes all mapper outputs into one array, ready to be written as a file.
 *
 * @param key     NullWritable, not used.
 * @param values  Iterable with AveragePhredCalculator items from each mapper.
 * @param context Context containing job information.
 * @throws IOException          When something went wrong.
 * @throws InterruptedException When the connection was interrupted.
 */
@Override
public void reduce(NullWritable key, Iterable<TextArrayWritable> values, Context context)
        throws IOException, InterruptedException {
    LinkedList<Float> sizablePhredArray = new LinkedList<>();
    LinkedList<Integer> sizableCountArray = new LinkedList<>();

    // For each mapper output, accumulate the values into the sizable arrays.
    for (TextArrayWritable value : values) {
        Text[] asciiArray = value.get();

        // Text arrays containing "phred|count" pairs; add these to the sizable arrays only if not empty.
        if (asciiArray.length > 0) {
            for (int i = 0; i < asciiArray.length; i++) {
                try {
                    sizablePhredArray.set(i, sizablePhredArray.get(i)
                            + Float.parseFloat(asciiArray[i].toString().split("\\|")[0]));
                    sizableCountArray.set(i, sizableCountArray.get(i)
                            + Integer.parseInt(asciiArray[i].toString().split("\\|")[1]));
                } catch (IndexOutOfBoundsException e) {
                    sizablePhredArray.add(i, Float.parseFloat(asciiArray[i].toString().split("\\|")[0]));
                    sizableCountArray.add(i, Integer.parseInt(asciiArray[i].toString().split("\\|")[1]));
                }
            }
        }
    }

    // Instantiate the Text array and add the header plus one line per base position.
    Text[] phredCount = new Text[(sizablePhredArray.size() + 1)];
    phredCount[0] = new Text("base_position\taverage_phred_score");
    for (int i = 0; i < sizablePhredArray.size(); i++) {
        phredCount[i + 1] = new Text((i + 1) + "\t" + (sizablePhredArray.get(i) / sizableCountArray.get(i)));
    }

    // Add the Text array to the TextArrayWritable wrapper and write the result.
    context.write(NullWritable.get(), new TextArrayWritable(Text.class, phredCount));
}
From source file:nl.bioinf.wvanhelvoirt.HadoopPhredCalculator.ReadMapper.java
License:Open Source License
/**
 * Override method that processes one RecordReader item and sends its output to the reduce step.
 *
 * @param key     LongWritable as key.
 * @param value   Text containing reads (one read is 4 lines) from the fastq file.
 * @param context Context containing job information.
 * @throws IOException          When something went wrong.
 * @throws InterruptedException When the connection was interrupted.
 */
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    // Set the configuration, read data and ascii base value.
    Configuration conf = context.getConfiguration();
    int asciiBase = conf.getInt("ascii.base", 64);
    String[] readData = value.toString().split("\\n");
    LinkedList<Float> sizablePhredArray = new LinkedList<>();
    LinkedList<Integer> sizableCountArray = new LinkedList<>();

    for (int i = 0; i < readData.length; i += 4) {
        // If the length of the base line equals the length of the phred line.
        if (readData[i + 1].length() == readData[i + 3].length()) {
            // Add each character's phred value to the sizable arrays.
            for (int j = 0; j < readData[i + 3].length(); j++) {
                try {
                    sizablePhredArray.set(j,
                            sizablePhredArray.get(j) + ((float) readData[i + 3].charAt(j) - asciiBase));
                    sizableCountArray.set(j, sizableCountArray.get(j) + 1);
                } catch (IndexOutOfBoundsException e) {
                    sizablePhredArray.add(j, ((float) readData[i + 3].charAt(j) - asciiBase));
                    sizableCountArray.add(j, 1);
                }
            }
        }
    }

    // Instantiate the Text array and add "phred|count" lines.
    Text[] phredCount = new Text[sizablePhredArray.size()];
    for (int i = 0; i < sizablePhredArray.size(); i++) {
        phredCount[i] = new Text(sizablePhredArray.get(i) + "|" + sizableCountArray.get(i));
    }

    // Add the Text array to the TextArrayWritable wrapper and write the result.
    context.write(NullWritable.get(), new TextArrayWritable(Text.class, phredCount));
}
From source file:nl.gridline.free.taalserver.CountDocumentsReduce.java
License:Apache License
@Override
protected void reduce(LongWritable key, java.lang.Iterable<VarLongWritable> values, Context context)
        throws IOException, InterruptedException {
    // Count the documents by walking the iterator; the values themselves are not used.
    int numberOfDocuments = 0;
    Iterator<VarLongWritable> i = values.iterator();
    while (i.hasNext()) {
        numberOfDocuments++;
        i.next();
    }
    context.write(new VarIntWritable(numberOfDocuments), NullWritable.get());
    context.progress();
}
From source file:nl.gridline.free.taalserver.TFIdfSerializeMap.java
License:Apache License
@Override
protected void map(WordDocId key, TFIdfWritable value, Context context)
        throws IOException, InterruptedException {
    // Serialize as userID,itemID,preference (here: word hash, document id, tf-idf).
    StringBuilder b = new StringBuilder().append(key.getWord().hashCode()).append(',').append(key.getDocId())
            .append(',').append(value.getTdIdf());
    out.set(b.toString());
    context.write(out, NullWritable.get());
    context.progress();
}
From source file:nl.gridline.free.taalserver.TFIdfSerializeReduce.java
License:Apache License
@Override
protected void reduce(Text keyIn, Iterable<NullWritable> values, Context context)
        throws IOException, InterruptedException {
    // Just pass the keys through:
    context.write(keyIn, NullWritable.get());
    context.progress();
}
From source file:nl.gridline.zieook.inx.movielens.items.FilterDataReduce.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
        throws java.io.IOException, InterruptedException {
    // key == itemID
    // value == [allowed,<userID,preference>]
    // output key == userID,itemID,preference

    // Put the values in a set:
    Set<String> valueSet = new HashSet<String>();
    String itemId = key.toString();
    for (Text s : values) {
        valueSet.add(s.toString());
    }

    if (!shouldFilter || valueSet.contains("allowed")) {
        // Write all but 'allowed'.
        for (String s : valueSet) {
            if (!"allowed".equals(s)) {
                String[] data = s.split(",");
                if (data.length == 2) {
                    outKey.set(data[0] + "," + itemId + "," + data[1]);
                } else if (data.length == 1) {
                    outKey.set(data[0] + "," + itemId);
                } else {
                    LOG.error("Failed to read item, this is probably an error: value = '" + s + "' key = '"
                            + itemId + "'");
                }
                context.write(outKey, NullWritable.get());
            }
        }
    }
}
From source file:nl.sanoma.hdt.report.generator.ReportGeneratorReducerTest.java
License:Open Source License
/**
 * Test of the reduce method, of class ReportGeneratorReducer.
 */
@Test
public void testReduce() throws Exception {
    System.out.println("reduce");
    KeyData key = new KeyData(1, "candy");
    Iterable<ValueData> values = Arrays.asList(new ValueData("Q1", "fruit", 3.0),
            new ValueData("Q2", "fruit", 1.0), new ValueData("Q1", "grocery", 2.0),
            new ValueData("Q3", "grocery", 2.0), new ValueData("Q4", "grocery", 2.0));
    Reducer.Context context = mock(ReportGeneratorReducer.Context.class);
    ReportGeneratorReducer instance = new ReportGeneratorReducer();
    instance.reduce(key, values, context);
    // NullWritable.get() is the expected key of the single output record.
    verify(context).write(NullWritable.get(), new Text("1\tgrocery\t\t\t5.0\t1.0\t2.0\t2.0"));
}