Example usage for org.apache.hadoop.io IntWritable get

List of usage examples for org.apache.hadoop.io IntWritable get

Introduction

On this page you can find example usage for org.apache.hadoop.io IntWritable get().

Prototype

public int get() 

Source Link

Document

Return the value of this IntWritable.
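
A minimal, self-contained sketch of the call itself (the class name IntWritableGetDemo and the values are illustrative, not taken from the sources under Usage below): get() returns the primitive int wrapped by the IntWritable, and set(int) replaces it.

import org.apache.hadoop.io.IntWritable;

public class IntWritableGetDemo {
    public static void main(String[] args) {
        IntWritable writable = new IntWritable(42); // wrap a primitive int
        int value = writable.get();                 // get() returns the wrapped int
        System.out.println(value);                  // prints 42

        writable.set(7);                            // the wrapper is mutable
        System.out.println(writable.get());         // prints 7
    }
}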

Usage

From source file:ml.shifu.shifu.core.binning.UpdateBinningInfoReducer.java

License:Apache License

@Override
protected void reduce(IntWritable key, Iterable<BinningInfoWritable> values, Context context)
        throws IOException, InterruptedException {
    long start = System.currentTimeMillis();
    double sum = 0d;
    double squaredSum = 0d;
    double tripleSum = 0d;
    double quarticSum = 0d;

    long count = 0L, missingCount = 0L;
    double min = Double.MAX_VALUE, max = Double.MIN_VALUE;
    List<Double> binBoundaryList = null;
    List<String> binCategories = null;
    long[] binCountPos = null;
    long[] binCountNeg = null;
    double[] binWeightPos = null;
    double[] binWeightNeg = null;

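    // key.get() unwraps the IntWritable reducer key into the primitive column index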
    ColumnConfig columnConfig = this.columnConfigList.get(key.get());

    int binSize = 0;
    for (BinningInfoWritable info : values) {
        if (info.isNumeric() && binBoundaryList == null) {
            binBoundaryList = info.getBinBoundaries();
            binSize = binBoundaryList.size();
            binCountPos = new long[binSize + 1];
            binCountNeg = new long[binSize + 1];
            binWeightPos = new double[binSize + 1];
            binWeightNeg = new double[binSize + 1];
        }
        if (!info.isNumeric() && binCategories == null) {
            binCategories = info.getBinCategories();
            binSize = binCategories.size();
            binCountPos = new long[binSize + 1];
            binCountNeg = new long[binSize + 1];
            binWeightPos = new double[binSize + 1];
            binWeightNeg = new double[binSize + 1];
        }
        count += info.getTotalCount();
        missingCount += info.getMissingCount();
        // for numeric, such sums are OK, for categorical, such values are all 0, should be updated by using
        // binCountPos and binCountNeg
        sum += info.getSum();
        squaredSum += info.getSquaredSum();
        tripleSum += info.getTripleSum();
        quarticSum += info.getQuarticSum();
        if (Double.compare(max, info.getMax()) < 0) {
            max = info.getMax();
        }

        if (Double.compare(min, info.getMin()) > 0) {
            min = info.getMin();
        }

        for (int i = 0; i < (binSize + 1); i++) {
            binCountPos[i] += info.getBinCountPos()[i];
            binCountNeg[i] += info.getBinCountNeg()[i];
            binWeightPos[i] += info.getBinWeightPos()[i];
            binWeightNeg[i] += info.getBinWeightNeg()[i];
        }
    }

    double[] binPosRate = computePosRate(binCountPos, binCountNeg);

    String binBounString = null;
    if (columnConfig.isCategorical()) {
        if (binCategories.size() == 0 || binCategories.size() > MAX_CATEGORICAL_BINC_COUNT) {
            LOG.warn("Column {} {} with invalid bin boundary size.", key.get(), columnConfig.getColumnName(),
                    binCategories.size());
            return;
        }
        binBounString = Base64Utils.base64Encode(
                "[" + StringUtils.join(binCategories, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR) + "]");
        // recompute such value for categorical variables
        min = Double.MAX_VALUE;
        max = Double.MIN_VALUE;
        sum = 0d;
        squaredSum = 0d;
        for (int i = 0; i < binPosRate.length; i++) {
            if (!Double.isNaN(binPosRate[i])) {
                if (Double.compare(max, binPosRate[i]) < 0) {
                    max = binPosRate[i];
                }

                if (Double.compare(min, binPosRate[i]) > 0) {
                    min = binPosRate[i];
                }
                long binCount = binCountPos[i] + binCountNeg[i];
                sum += binPosRate[i] * binCount;
                double squaredVal = binPosRate[i] * binPosRate[i];
                squaredSum += squaredVal * binCount;
                tripleSum += squaredVal * binPosRate[i] * binCount;
                quarticSum += squaredVal * squaredVal * binCount;
            }
        }
    } else {
        if (binBoundaryList.size() == 0) {
            LOG.warn("Column {} {} with invalid bin boundary size.", key.get(), columnConfig.getColumnName(),
                    binBoundaryList.size());
            return;
        }
        binBounString = binBoundaryList.toString();
    }

    ColumnMetrics columnCountMetrics = ColumnStatsCalculator.calculateColumnMetrics(binCountNeg, binCountPos);
    ColumnMetrics columnWeightMetrics = ColumnStatsCalculator.calculateColumnMetrics(binWeightNeg,
            binWeightPos);

    // TODO & FIXME do we need validCount(totalCount - missingValueCount) for mean and stddev???
    double mean = sum / count;
    double stdDev = Math.sqrt(Math.abs((squaredSum - (sum * sum) / count + EPS) / (count - 1)));
    double aStdDev = Math.sqrt(Math.abs((squaredSum - (sum * sum) / count + EPS) / count));

    double skewness = ColumnStatsCalculator.computeSkewness(count, mean, aStdDev, sum, squaredSum, tripleSum);
    double kurtosis = ColumnStatsCalculator.computeKurtosis(count, mean, aStdDev, sum, squaredSum, tripleSum,
            quarticSum);

    sb.append(key.get()).append(Constants.DEFAULT_DELIMITER).append(binBounString)
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(binCountNeg))
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(binCountPos))
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(new double[0]))
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(binPosRate))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(columnCountMetrics.getKs()))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(columnWeightMetrics.getIv()))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(max)).append(Constants.DEFAULT_DELIMITER)
            .append(df.format(min)).append(Constants.DEFAULT_DELIMITER).append(df.format(mean))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(stdDev)).append(Constants.DEFAULT_DELIMITER)
            .append(columnConfig.isCategorical() ? "C" : "N").append(Constants.DEFAULT_DELIMITER)
            .append(df.format(mean)).append(Constants.DEFAULT_DELIMITER).append(missingCount)
            .append(Constants.DEFAULT_DELIMITER).append(count).append(Constants.DEFAULT_DELIMITER)
            .append(missingCount * 1.0d / count).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binWeightNeg)).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binWeightPos)).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics.getWoe()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics.getWoe()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics.getKs()).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics.getIv()).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics.getBinningWoe().toString()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics.getBinningWoe().toString()).append(Constants.DEFAULT_DELIMITER)
            .append(skewness).append(Constants.DEFAULT_DELIMITER).append(kurtosis);

    outputValue.set(sb.toString());
    context.write(NullWritable.get(), outputValue);
    sb.delete(0, sb.length());
    LOG.debug("Time:{}", (System.currentTimeMillis() - start));
}

From source file:ml.shifu.shifu.core.correlation.CorrelationReducer.java

License:Apache License

@Override
protected void reduce(IntWritable key, Iterable<CorrelationWritable> values, Context context)
        throws IOException, InterruptedException {
    // build final correlation column info
    CorrelationWritable finalCw = null;

    Iterator<CorrelationWritable> cwIt = values.iterator();
    while (cwIt.hasNext()) {
        CorrelationWritable cw = cwIt.next();

        if (!cw.isValid()) {
            // In this case, there is no need to process this one because all of its internal values are null
            LOG.warn("This CorrelationWritable has not been initialized, so we ignore it");
            continue;
        }

        if (finalCw == null) {
            finalCw = initCw(cw.getAdjustCount().length);
        }
        finalCw.setColumnIndex(cw.getColumnIndex());
        finalCw.combine(cw);
    }

    if (finalCw == null) {
        LOG.warn("Key: {}, Reducer result is null because there is no useful correlationwritable from Mapper.",
                key.get());
        return;
    }

    this.outputKey.set(key.get());
    this.outputValue.set(new String(Base64.encodeBase64(objectToBytes(finalCw)), "utf-8"));
    context.write(outputKey, outputValue);
}

From source file:ml.shifu.shifu.core.posttrain.FeatureImportanceReducer.java

License:Apache License

@Override
protected void reduce(IntWritable key, Iterable<DoubleWritable> values, Context context)
        throws IOException, InterruptedException {
    double sum = 0d;
    for (DoubleWritable dw : values) {
        sum += dw.get();
    }
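    // key.get() yields the primitive variable index used as the key of the aggregated sum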
    this.variableStatsMap.put(key.get(), sum);
}

From source file:mlbench.bayes.BayesUtils.java

License:Apache License

static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);

    try {
        long currentChunkSize = 0;
        long featureCount = 0;
        long vectorCount = Long.MAX_VALUE;
        Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
        for (Pair<IntWritable, LongWritable> record : new SequenceFileDirIterable<IntWritable, LongWritable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {

            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(freqWriter, false);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            IntWritable key = record.getFirst();
            LongWritable value = record.getSecond();
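            // key.get(): a non-negative value is a feature index; -1 marks the record holding the total vector count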
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);

        }
        featureCount++;
        Long[] counts = { featureCount, vectorCount };
        return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
        Closeables.close(freqWriter, false);
    }
}

From source file:mlbench.bayes.train.WeightSummer.java

License:Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException {
    parseArgs(args);
    HashMap<String, String> conf = new HashMap<String, String>();
    initConf(conf);
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {

        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                (JobConf) config, inDir, rank);
        Vector weightsPerFeature = null;
        Vector weightsPerLabel = new DenseVector(labNum);

        for (int i = 0; i < inputs.length; i++) {
            FileSplit fsplit = inputs[i];
            SequenceFileRecordReader<IntWritable, VectorWritable> kvrr = new SequenceFileRecordReader<>(config,
                    fsplit);
            IntWritable index = kvrr.createKey();
            VectorWritable value = kvrr.createValue();
            while (kvrr.next(index, value)) {
                Vector instance = value.get();
                if (weightsPerFeature == null) {
                    weightsPerFeature = new RandomAccessSparseVector(instance.size(),
                            instance.getNumNondefaultElements());
                }

                int label = index.get();
                weightsPerFeature.assign(instance, Functions.PLUS);
                weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum());
            }
        }
        if (weightsPerFeature != null) {
            MPI_D.Send(new Text(WEIGHTS_PER_FEATURE), new VectorWritable(weightsPerFeature));
            MPI_D.Send(new Text(WEIGHTS_PER_LABEL), new VectorWritable(weightsPerLabel));
        }
    } else if (MPI_D.COMM_BIPARTITE_A != null) {
        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
        config.set(MAPRED_OUTPUT_DIR, outDirW);
        config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString());
        ((JobConf) config).setOutputKeyClass(Text.class);
        ((JobConf) config).setOutputValueClass(VectorWritable.class);
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(config,
                DataMPIUtil.getHadoopTaskAttemptID());
        SequenceFileOutputFormat<Text, VectorWritable> outfile = new SequenceFileOutputFormat<>();
        FileSystem fs = FileSystem.get(config);

        Path output = new Path(config.get(MAPRED_OUTPUT_DIR));
        FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext);
        RecordWriter<Text, VectorWritable> outrw = null;
        try {
            fcommitter.setupJob(taskContext);
            outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("ERROR: Please set the HDFS configuration properly\n");
            System.exit(-1);
        }

        Text key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        Vector vector = null;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (Text) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) {
            } else if (!key.equals(newKey)) {
                outrw.write(key, new VectorWritable(vector));
                vector = null;
            }
            if (vector == null) {
                vector = newPoint.get();
            } else {
                vector.assign(newPoint.get(), Functions.PLUS);
            }

            key = newKey;
            point = newPoint;
            vals = MPI_D.Recv();
        }
        if (newKey != null && newPoint != null) {
            outrw.write(key, new VectorWritable(vector));
        }

        outrw.close(null);
        if (fcommitter.needsTaskCommit(taskContext)) {
            fcommitter.commitTask(taskContext);
        }

        MPI_D.COMM_BIPARTITE_A.Barrier();
        if (rank == 0) {
            Path resOut = new Path(outDir);
            NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path(outDir), config);
            naiveBayesModel.serialize(resOut, config);
        }
    }

    MPI_D.Finalize();
}

From source file:mlbench.kmeans.KmeansInit.java

License:Apache License

/**
 * get the input values and choose the K clusters' centers
 *
 * @param dataPath
 * @throws MPI_D_Exception
 * @throws IOException
 * @throws MPIException
 */
@SuppressWarnings("deprecation")
private static void init(String args[], String dataPath, int kCluster, HashMap<String, String> conf)
        throws MPI_D_Exception, IOException, MPIException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);

        // for record the initialized state
        for (FileSplit path : inputs) {
            SequenceFileInputFormat f = new SequenceFileInputFormat();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);

            Random random = new Random(1000);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();

            IntWritable cluster = new IntWritable();
            while (reader.next(k, v)) {
                cluster.set(random.nextInt(kCluster));
                MPI_D.Send(cluster, v);
            }
            reader.close();
        }
    } else {
        IntWritable key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        double sum[] = null;
        int count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) {
                sum = new double[newPoint.get().size()];
            } else if (!key.equals(newKey)) {
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = sum[i] / count;
                }
                PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals);
                centers.add(oneCenter);
                sum = new double[point.get().size()];
                count = 0;
            }
            key = newKey;
            point = newPoint;
            KmeansUtils.accumulate(sum, newPoint.get());
            count++;
            vals = MPI_D.Recv();
        }
        if (newKey != null && newPoint != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            }
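            // key.get() is the id of the last cluster group that was accumulated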
            PointVector oneCenter = new PointVector(key.get(), centerVals);
            centers.add(oneCenter);
        }

        transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.gatherCentersByP2P(centers);

        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);

            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());
            }
            resOut.flush();
            resOut.close();
        }

        System.out.println("rank " + rank + " finish");
    }
    MPI_D.Finalize();
}

From source file:mlbench.kmeans.KmeansIter.java

License:Apache License

/**
 * Calculate the new center iteratively
 *
 * @throws MPI_D_Exception
 * @throws MPIException
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private static void iterBody(String args[], HashMap<String, String> conf)
        throws MPI_D_Exception, MPIException, IOException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);

    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);

        if (rank == 0) {
            System.out.println(centerPath);
            DataInputStream in = KmeansUtils.readFromHDFSF(new Path(centerPath), config);

            String lineVal;
            try {
                while ((lineVal = in.readLine()) != null) {
                    String lineSeq[] = lineVal.split(":");
                    PointVector p = new PointVector(Integer.valueOf(lineSeq[0]), format(lineSeq[1]));
                    centers.add(p);
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        MPI_D.COMM_BIPARTITE_O.Barrier();

        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.broadcastCenters(centers);

        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);
        double centerSum[][] = new double[kCluster][];
        long centerPNum[] = new long[kCluster];

        // for record the initialized state
        for (FileSplit path : inputs) {
            SequenceFileInputFormat f = new SequenceFileInputFormat();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();

            while (reader.next(k, v)) {
                int centerBelong = (int) getBelongPoint(v);
                //                    int i = (int) p.getStrClusterClass();
                //                    double[] vals = p.getDoubleValue();
                int len = v.get().size();
                if (centerSum[centerBelong] == null) {
                    centerSum[centerBelong] = new double[len];
                }
                for (int j = 0; j < len; j++) {
                    centerSum[centerBelong][j] += v.get().get(j);
                }
                centerPNum[centerBelong]++;
            }
            reader.close();
        }

        for (int i = 0; i < centerPNum.length; i++) {
            if (centerSum[i] == null && centerPNum[i] == 0) {
                continue;
            }
            MPI_D.Send(new IntWritable(i), new KmeansCenters(centerPNum[i], centerSum[i]));
        }
    } else {
        centers.clear();
        IntWritable key = null, newKey = null;
        KmeansCenters value = null, newValue = null;
        double sum[] = null;
        long count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newValue = (KmeansCenters) vals[1];
            if (key == null && value == null) {
                sum = new double[newValue.getVector().length];
            } else if (!key.equals(newKey)) {
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = (double) sum[i] / count;
                }
                PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals);
                centers.add(oneCenter);
                sum = new double[value.getVector().length];
                count = 0;
            }
            key = newKey;
            value = newValue;
            KmeansUtils.accumulate(sum, newValue.getVector());
            count += Long.valueOf(newValue.getPointSize());
            vals = MPI_D.Recv();
        }
        if (newKey != null && newValue != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            }
            PointVector oneCenter = new PointVector(key.get(), centerVals);
            centers.add(oneCenter);
        }

        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.gatherCentersByP2P(centers);

        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);

            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());
            }
            resOut.flush();
            resOut.close();
        }
    }
    MPI_D.Finalize();
}

From source file:mx.itam.metodos.lshclustering.MinhashEmitMapper.java

License:Apache License

@Override
public void map(Text id, IntArrayWritable values, Context context) throws IOException, InterruptedException {
    for (int i = 0; i < functionsCount; i++) {
        hashValues[i] = Integer.MAX_VALUE;
    }
    for (int i = 0; i < functionsCount; i++) {
        HashFunction hf = functions[i];
        for (Writable wr : values.get()) {
            IntWritable value = (IntWritable) wr;
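            // value.get() unwraps the int element before it is hashed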
            int hash = hf.hashInt(value.get()).asInt();
            if (hash < hashValues[i]) {
                hashValues[i] = hash;
            }
        }
    }
    Text sketch = new Text();
    Hasher hasher = lsh.newHasher();
    int band = 0;
    for (int i = 0; i < functionsCount; i++) {
        hasher.putInt(hashValues[i]);
        if (i > 0 && (i % rows) == 0) {
            sketch.set(band + "-" + hasher.hash().toString());
            context.write(new SecondarySortKey(sketch, id), id);
            hasher = lsh.newHasher();
            band++;
        }
    }
    sketch.set(band + "-" + hasher.hash().toString());
    context.write(new SecondarySortKey(sketch, id), id);
}

From source file:mx.itam.metodos.minhashing.MinhashMapper.java

License:Apache License

@Override
public void map(Text id, IntArrayWritable values, Context ctx) throws IOException, InterruptedException {
    for (int i = 0; i < functionsCount; i++) {
        hashValues[i] = Integer.MAX_VALUE;
    }
    for (int i = 0; i < functionsCount; i++) {
        HashFunction hf = functions[i];
        for (Writable wr : values.get()) {
            IntWritable value = (IntWritable) wr;
            int hash = hf.hashInt(value.get()).asInt();
            if (hash < hashValues[i]) {
                hashValues[i] = hash;
            }
        }
    }
    Text sketch = new Text();
    Hasher hasher = lsh.newHasher();
    int band = 0;
    for (int i = 0; i < functionsCount; i++) {
        hasher.putInt(hashValues[i]);
        if (i > 0 && (i % rows) == 0) {
            sketch.set(band + "-" + hasher.hash().toString());
            write(id, sketch, ctx);
            hasher = lsh.newHasher();
            band++;
        }
    }
    sketch.set(band + "-" + hasher.hash().toString());
    write(id, sketch, ctx);
}

From source file:mx.iteso.msc.asn.mrwordcount.MyReducer.java

License:Apache License

protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
    int sum = 0;

    for (IntWritable val : values) {
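        // val.get() unwraps each partial count before summing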
        sum += val.get();
    }
    context.write(key, new IntWritable(sum));
}