List of usage examples for org.apache.hadoop.io.IntWritable.get()
public int get()
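IntWritable is Hadoop's mutable Writable box around a primitive int, and get() simply returns the boxed value. Before the longer excerpts below, a minimal standalone sketch of the get()/set() round trip; the class name IntWritableGetDemo is invented for illustration and does not come from any of the listed source files:

import org.apache.hadoop.io.IntWritable;

public class IntWritableGetDemo {
    public static void main(String[] args) {
        IntWritable count = new IntWritable(42); // wrap a primitive int
        int v = count.get();                     // get() returns the wrapped value: 42
        count.set(v + 1);                        // IntWritable is mutable, so instances are reused
        System.out.println(count.get());         // prints 43
    }
}

Because Hadoop reuses Writable instances when iterating over keys and reduce values, callers read the primitive out with get() (or otherwise copy it) before the iterator advances, which is exactly the pattern in every example below.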
From source file:ml.shifu.shifu.core.binning.UpdateBinningInfoReducer.java
License:Apache License
@Override
protected void reduce(IntWritable key, Iterable<BinningInfoWritable> values, Context context)
        throws IOException, InterruptedException {
    long start = System.currentTimeMillis();
    double sum = 0d;
    double squaredSum = 0d;
    double tripleSum = 0d;
    double quarticSum = 0d;
    long count = 0L, missingCount = 0L;
    // note: Double.MIN_VALUE is the smallest positive double, so this max would miss all-negative data
    double min = Double.MAX_VALUE, max = Double.MIN_VALUE;
    List<Double> binBoundaryList = null;
    List<String> binCategories = null;
    long[] binCountPos = null;
    long[] binCountNeg = null;
    double[] binWeightPos = null;
    double[] binWeightNeg = null;
    ColumnConfig columnConfig = this.columnConfigList.get(key.get());
    int binSize = 0;
    for (BinningInfoWritable info : values) {
        if (info.isNumeric() && binBoundaryList == null) {
            binBoundaryList = info.getBinBoundaries();
            binSize = binBoundaryList.size();
            binCountPos = new long[binSize + 1];
            binCountNeg = new long[binSize + 1];
            binWeightPos = new double[binSize + 1];
            binWeightNeg = new double[binSize + 1];
        }
        if (!info.isNumeric() && binCategories == null) {
            binCategories = info.getBinCategories();
            binSize = binCategories.size();
            binCountPos = new long[binSize + 1];
            binCountNeg = new long[binSize + 1];
            binWeightPos = new double[binSize + 1];
            binWeightNeg = new double[binSize + 1];
        }
        count += info.getTotalCount();
        missingCount += info.getMissingCount();
        // for numeric columns these sums are OK; for categorical columns they are all 0 and are
        // recomputed below from binCountPos and binCountNeg
        sum += info.getSum();
        squaredSum += info.getSquaredSum();
        tripleSum += info.getTripleSum();
        quarticSum += info.getQuarticSum();
        if (Double.compare(max, info.getMax()) < 0) {
            max = info.getMax();
        }
        if (Double.compare(min, info.getMin()) > 0) {
            min = info.getMin();
        }
        for (int i = 0; i < (binSize + 1); i++) {
            binCountPos[i] += info.getBinCountPos()[i];
            binCountNeg[i] += info.getBinCountNeg()[i];
            binWeightPos[i] += info.getBinWeightPos()[i];
            binWeightNeg[i] += info.getBinWeightNeg()[i];
        }
    }
    double[] binPosRate = computePosRate(binCountPos, binCountNeg);
    String binBounString = null;
    if (columnConfig.isCategorical()) {
        if (binCategories.size() == 0 || binCategories.size() > MAX_CATEGORICAL_BINC_COUNT) {
            LOG.warn("Column {} {} with invalid bin category size {}.", key.get(),
                    columnConfig.getColumnName(), binCategories.size());
            return;
        }
        binBounString = Base64Utils.base64Encode(
                "[" + StringUtils.join(binCategories, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR) + "]");
        // recompute these statistics for categorical variables from the per-bin positive rates
        min = Double.MAX_VALUE;
        max = Double.MIN_VALUE;
        sum = 0d;
        squaredSum = 0d;
        for (int i = 0; i < binPosRate.length; i++) {
            if (!Double.isNaN(binPosRate[i])) {
                if (Double.compare(max, binPosRate[i]) < 0) {
                    max = binPosRate[i];
                }
                if (Double.compare(min, binPosRate[i]) > 0) {
                    min = binPosRate[i];
                }
                long binCount = binCountPos[i] + binCountNeg[i];
                sum += binPosRate[i] * binCount;
                double squaredVal = binPosRate[i] * binPosRate[i];
                squaredSum += squaredVal * binCount;
                tripleSum += squaredVal * binPosRate[i] * binCount;
                quarticSum += squaredVal * squaredVal * binCount;
            }
        }
    } else {
        if (binBoundaryList.size() == 0) {
            LOG.warn("Column {} {} with invalid bin boundary size {}.", key.get(),
                    columnConfig.getColumnName(), binBoundaryList.size());
            return;
        }
        binBounString = binBoundaryList.toString();
    }
    ColumnMetrics columnCountMetrics = ColumnStatsCalculator.calculateColumnMetrics(binCountNeg, binCountPos);
    ColumnMetrics columnWeightMetrics = ColumnStatsCalculator.calculateColumnMetrics(binWeightNeg, binWeightPos);
    // TODO & FIXME: do we need validCount (totalCount - missingCount) for mean and stddev?
    double mean = sum / count;
    double stdDev = Math.sqrt(Math.abs((squaredSum - (sum * sum) / count + EPS) / (count - 1)));
    double aStdDev = Math.sqrt(Math.abs((squaredSum - (sum * sum) / count + EPS) / count));
    double skewness = ColumnStatsCalculator.computeSkewness(count, mean, aStdDev, sum, squaredSum, tripleSum);
    double kurtosis = ColumnStatsCalculator.computeKurtosis(count, mean, aStdDev, sum, squaredSum, tripleSum,
            quarticSum);
    sb.append(key.get()).append(Constants.DEFAULT_DELIMITER).append(binBounString)
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(binCountNeg))
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(binCountPos))
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(new double[0]))
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(binPosRate))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(columnCountMetrics.getKs()))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(columnWeightMetrics.getIv()))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(max)).append(Constants.DEFAULT_DELIMITER)
            .append(df.format(min)).append(Constants.DEFAULT_DELIMITER).append(df.format(mean))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(stdDev)).append(Constants.DEFAULT_DELIMITER)
            .append(columnConfig.isCategorical() ? "C" : "N").append(Constants.DEFAULT_DELIMITER)
            .append(df.format(mean)).append(Constants.DEFAULT_DELIMITER).append(missingCount)
            .append(Constants.DEFAULT_DELIMITER).append(count).append(Constants.DEFAULT_DELIMITER)
            .append(missingCount * 1.0d / count).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binWeightNeg)).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binWeightPos)).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics.getWoe()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics.getWoe()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics.getKs()).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics.getIv()).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics.getBinningWoe().toString()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics.getBinningWoe().toString()).append(Constants.DEFAULT_DELIMITER)
            .append(skewness).append(Constants.DEFAULT_DELIMITER).append(kurtosis);
    outputValue.set(sb.toString());
    context.write(NullWritable.get(), outputValue);
    sb.delete(0, sb.length());
    LOG.debug("Time:{}", (System.currentTimeMillis() - start));
}
From source file:ml.shifu.shifu.core.correlation.CorrelationReducer.java
License:Apache License
@Override
protected void reduce(IntWritable key, Iterable<CorrelationWritable> values, Context context)
        throws IOException, InterruptedException {
    // build the final correlation column info
    CorrelationWritable finalCw = null;
    Iterator<CorrelationWritable> cwIt = values.iterator();
    while (cwIt.hasNext()) {
        CorrelationWritable cw = cwIt.next();
        if (!cw.isValid()) {
            // nothing to process: none of this writable's inner values have been set
            LOG.warn("CorrelationWritable has not been initialized, so we ignore it.");
            continue;
        }
        if (finalCw == null) {
            finalCw = initCw(cw.getAdjustCount().length);
        }
        finalCw.setColumnIndex(cw.getColumnIndex());
        finalCw.combine(cw);
    }
    if (finalCw == null) {
        LOG.warn("Key: {}, reducer result is null because there is no usable CorrelationWritable from the mapper.",
                key.get());
        return;
    }
    this.outputKey.set(key.get());
    this.outputValue.set(new String(Base64.encodeBase64(objectToBytes(finalCw)), "utf-8"));
    context.write(outputKey, outputValue);
}
From source file:ml.shifu.shifu.core.posttrain.FeatureImportanceReducer.java
License:Apache License
@Override
protected void reduce(IntWritable key, Iterable<DoubleWritable> values, Context context)
        throws IOException, InterruptedException {
    double sum = 0d;
    for (DoubleWritable dw : values) {
        sum += dw.get();
    }
    this.variableStatsMap.put(key.get(), sum);
}
From source file:mlbench.bayes.BayesUtils.java
License:Apache License
static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);
    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);
    try {
        long currentChunkSize = 0;
        long featureCount = 0;
        long vectorCount = Long.MAX_VALUE;
        Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
        for (Pair<IntWritable, LongWritable> record : new SequenceFileDirIterable<IntWritable, LongWritable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {
            // roll over to a new chunk file once the current one exceeds the size limit
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(freqWriter, false);
                chunkIndex++;
                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);
                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }
            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            IntWritable key = record.getFirst();
            LongWritable value = record.getSecond();
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                // the reserved key -1 carries the total vector count
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);
        }
        featureCount++;
        Long[] counts = { featureCount, vectorCount };
        return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
        Closeables.close(freqWriter, false);
    }
}
From source file:mlbench.bayes.train.WeightSummer.java
License:Apache License
@SuppressWarnings("deprecation") public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException { parseArgs(args);//from w ww .j a v a 2 s . co m HashMap<String, String> conf = new HashMap<String, String>(); initConf(conf); MPI_D.Init(args, MPI_D.Mode.Common, conf); if (MPI_D.COMM_BIPARTITE_O != null) { int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O); FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, (JobConf) config, inDir, rank); Vector weightsPerFeature = null; Vector weightsPerLabel = new DenseVector(labNum); for (int i = 0; i < inputs.length; i++) { FileSplit fsplit = inputs[i]; SequenceFileRecordReader<IntWritable, VectorWritable> kvrr = new SequenceFileRecordReader<>(config, fsplit); IntWritable index = kvrr.createKey(); VectorWritable value = kvrr.createValue(); while (kvrr.next(index, value)) { Vector instance = value.get(); if (weightsPerFeature == null) { weightsPerFeature = new RandomAccessSparseVector(instance.size(), instance.getNumNondefaultElements()); } int label = index.get(); weightsPerFeature.assign(instance, Functions.PLUS); weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum()); } } if (weightsPerFeature != null) { MPI_D.Send(new Text(WEIGHTS_PER_FEATURE), new VectorWritable(weightsPerFeature)); MPI_D.Send(new Text(WEIGHTS_PER_LABEL), new VectorWritable(weightsPerLabel)); } } else if (MPI_D.COMM_BIPARTITE_A != null) { int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A); config.set(MAPRED_OUTPUT_DIR, outDirW); config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString()); ((JobConf) config).setOutputKeyClass(Text.class); ((JobConf) config).setOutputValueClass(VectorWritable.class); TaskAttemptContext taskContext = new TaskAttemptContextImpl(config, DataMPIUtil.getHadoopTaskAttemptID()); SequenceFileOutputFormat<Text, VectorWritable> outfile = new SequenceFileOutputFormat<>(); FileSystem fs = FileSystem.get(config); Path output = new Path(config.get(MAPRED_OUTPUT_DIR)); FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext); RecordWriter<Text, VectorWritable> outrw = null; try { fcommitter.setupJob(taskContext); outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null); } catch (IOException e) { e.printStackTrace(); System.err.println("ERROR: Please set the HDFS configuration properly\n"); System.exit(-1); } Text key = null, newKey = null; VectorWritable point = null, newPoint = null; Vector vector = null; Object[] vals = MPI_D.Recv(); while (vals != null) { newKey = (Text) vals[0]; newPoint = (VectorWritable) vals[1]; if (key == null && point == null) { } else if (!key.equals(newKey)) { outrw.write(key, new VectorWritable(vector)); vector = null; } if (vector == null) { vector = newPoint.get(); } else { vector.assign(newPoint.get(), Functions.PLUS); } key = newKey; point = newPoint; vals = MPI_D.Recv(); } if (newKey != null && newPoint != null) { outrw.write(key, new VectorWritable(vector)); } outrw.close(null); if (fcommitter.needsTaskCommit(taskContext)) { fcommitter.commitTask(taskContext); } MPI_D.COMM_BIPARTITE_A.Barrier(); if (rank == 0) { Path resOut = new Path(outDir); NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path(outDir), config); naiveBayesModel.serialize(resOut, config); } } MPI_D.Finalize(); }
From source file:mlbench.kmeans.KmeansInit.java
License:Apache License
/**
 * Get the input values and choose the k clusters' centers.
 *
 * @param dataPath
 * @throws MPI_D_Exception
 * @throws IOException
 * @throws MPIException
 */
@SuppressWarnings("deprecation")
private static void init(String args[], String dataPath, int kCluster, HashMap<String, String> conf)
        throws MPI_D_Exception, IOException, MPIException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);
        // randomly assign each input point to a cluster and send it downstream
        for (FileSplit path : inputs) {
            SequenceFileInputFormat f = new SequenceFileInputFormat();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);
            Random random = new Random(1000);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();
            IntWritable cluster = new IntWritable();
            while (reader.next(k, v)) {
                cluster.set(random.nextInt(kCluster));
                MPI_D.Send(cluster, v);
            }
            reader.close();
        }
    } else {
        IntWritable key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        double sum[] = null;
        int count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) {
                // first record received
                sum = new double[newPoint.get().size()];
            } else if (!key.equals(newKey)) {
                // cluster boundary: average the accumulated points into one initial center
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = sum[i] / count;
                }
                centers.add(new PointVector(key.get(), centerVals));
                sum = new double[point.get().size()];
                count = 0;
            }
            key = newKey;
            point = newPoint;
            KmeansUtils.accumulate(sum, newPoint.get());
            count++;
            vals = MPI_D.Recv();
        }
        if (newKey != null && newPoint != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            }
            centers.add(new PointVector(key.get(), centerVals));
        }
        transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.gatherCentersByP2P(centers);
        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);
            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());
            }
            resOut.flush();
            resOut.close();
        }
        System.out.println("rank " + rank + " finish");
    }
    MPI_D.Finalize();
}
From source file:mlbench.kmeans.KmeansIter.java
License:Apache License
/**
 * Calculate the new centers iteratively.
 *
 * @throws MPI_D_Exception
 * @throws MPIException
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private static void iterBody(String args[], HashMap<String, String> conf)
        throws MPI_D_Exception, MPIException, IOException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        if (rank == 0) {
            System.out.println(centerPath);
            DataInputStream in = KmeansUtils.readFromHDFSF(new Path(centerPath), config);
            String lineVal;
            try {
                while ((lineVal = in.readLine()) != null) {
                    String lineSeq[] = lineVal.split(":");
                    PointVector p = new PointVector(Integer.valueOf(lineSeq[0]), format(lineSeq[1]));
                    centers.add(p);
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        MPI_D.COMM_BIPARTITE_O.Barrier();
        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.broadcastCenters(centers);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);
        double centerSum[][] = new double[kCluster][];
        long centerPNum[] = new long[kCluster];
        for (FileSplit path : inputs) {
            SequenceFileInputFormat f = new SequenceFileInputFormat();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();
            while (reader.next(k, v)) {
                // find the nearest current center and accumulate this point into its sum
                int centerBelong = (int) getBelongPoint(v);
                int len = v.get().size();
                if (centerSum[centerBelong] == null) {
                    centerSum[centerBelong] = new double[len];
                }
                for (int j = 0; j < len; j++) {
                    centerSum[centerBelong][j] += v.get().get(j);
                }
                centerPNum[centerBelong]++;
            }
            reader.close();
        }
        for (int i = 0; i < centerPNum.length; i++) {
            if (centerSum[i] == null && centerPNum[i] == 0) {
                continue;
            }
            MPI_D.Send(new IntWritable(i), new KmeansCenters(centerPNum[i], centerSum[i]));
        }
    } else {
        centers.clear();
        IntWritable key = null, newKey = null;
        KmeansCenters value = null, newValue = null;
        double sum[] = null;
        long count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newValue = (KmeansCenters) vals[1];
            if (key == null && value == null) {
                // first record received
                sum = new double[newValue.getVector().length];
            } else if (!key.equals(newKey)) {
                // key change: emit the averaged center for the previous cluster
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = sum[i] / count;
                }
                centers.add(new PointVector(key.get(), centerVals));
                sum = new double[value.getVector().length];
                count = 0;
            }
            key = newKey;
            value = newValue;
            KmeansUtils.accumulate(sum, newValue.getVector());
            count += Long.valueOf(newValue.getPointSize());
            vals = MPI_D.Recv();
        }
        if (newKey != null && newValue != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            }
            centers.add(new PointVector(key.get(), centerVals));
        }
        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.gatherCentersByP2P(centers);
        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);
            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());
            }
            resOut.flush();
            resOut.close();
        }
    }
    MPI_D.Finalize();
}
From source file:mx.itam.metodos.lshclustering.MinhashEmitMapper.java
License:Apache License
@Override
public void map(Text id, IntArrayWritable values, Context context) throws IOException, InterruptedException {
    for (int i = 0; i < functionsCount; i++) {
        hashValues[i] = Integer.MAX_VALUE;
    }
    for (int i = 0; i < functionsCount; i++) {
        HashFunction hf = functions[i];
        for (Writable wr : values.get()) {
            IntWritable value = (IntWritable) wr;
            int hash = hf.hashInt(value.get()).asInt();
            if (hash < hashValues[i]) {
                hashValues[i] = hash;
            }
        }
    }
    Text sketch = new Text();
    Hasher hasher = lsh.newHasher();
    int band = 0;
    for (int i = 0; i < functionsCount; i++) {
        hasher.putInt(hashValues[i]);
        if (i > 0 && (i % rows) == 0) {
            sketch.set(band + "-" + hasher.hash().toString());
            context.write(new SecondarySortKey(sketch, id), id);
            hasher = lsh.newHasher();
            band++;
        }
    }
    sketch.set(band + "-" + hasher.hash().toString());
    context.write(new SecondarySortKey(sketch, id), id);
}
From source file:mx.itam.metodos.minhashing.MinhashMapper.java
License:Apache License
@Override
public void map(Text id, IntArrayWritable values, Context ctx) throws IOException, InterruptedException {
    for (int i = 0; i < functionsCount; i++) {
        hashValues[i] = Integer.MAX_VALUE;
    }
    for (int i = 0; i < functionsCount; i++) {
        HashFunction hf = functions[i];
        for (Writable wr : values.get()) {
            IntWritable value = (IntWritable) wr;
            int hash = hf.hashInt(value.get()).asInt();
            if (hash < hashValues[i]) {
                hashValues[i] = hash;
            }
        }
    }
    Text sketch = new Text();
    Hasher hasher = lsh.newHasher();
    int band = 0;
    for (int i = 0; i < functionsCount; i++) {
        hasher.putInt(hashValues[i]);
        if (i > 0 && (i % rows) == 0) {
            sketch.set(band + "-" + hasher.hash().toString());
            write(id, sketch, ctx);
            hasher = lsh.newHasher();
            band++;
        }
    }
    sketch.set(band + "-" + hasher.hash().toString());
    write(id, sketch, ctx);
}
From source file:mx.iteso.msc.asn.mrwordcount.MyReducer.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable val : values) {
        sum += val.get();
    }
    context.write(key, new IntWritable(sum));
}
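For context, a sketch of the kind of driver that would wire a reducer like the MyReducer above into a word-count job. This driver is an assumption for illustration, not part of the mx.iteso.msc.asn.mrwordcount listing: WordCountDriver and MyMapper are invented names, while the Job API calls are standard Hadoop.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(MyMapper.class);    // hypothetical mapper emitting (word, 1) pairs
        job.setCombinerClass(MyReducer.class); // summing is associative, so the reducer can combine
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}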