Example usage for org.apache.hadoop.io IntWritable toString

List of usage examples for org.apache.hadoop.io IntWritable toString

Introduction

On this page you can find example usages of org.apache.hadoop.io.IntWritable.toString().

Prototype

@Override
public String toString()
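
A minimal sketch of the behavior, not taken from any of the examples below: IntWritable.toString() returns the decimal string of the wrapped int, which is why the snippets that follow use it to build text keys, composite values, and file names.

IntWritable id = new IntWritable(42);
// toString() yields the decimal representation of the wrapped value
String s = id.toString(); // "42"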

Usage

From source file:clustering.tf_idf.TermFreqReducer.java

License:Apache License

/**
 * @param key    group_id
 * @param values position::term=count
 *               {@inheritDoc}
 */
@Override
protected void reduce(IntWritable key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    int termsCntInDoc = 0;
    this.termWeightMap.clear();

    for (Text val : values) {
        // positionTermCnt[0] = position
        // positionTermCnt[1] = term=count
        String[] positionTermCnt = val.toString().split("::");
        String position = positionTermCnt[0];

        String[] termCnt = positionTermCnt[1].split("=");

        int count = Integer.valueOf(termCnt[1]);
        termsCntInDoc += count;
        // TODO: 17-4-24 is it necessary to make it enum or a class?
        double weightedCount = position.equals("title") ? this.weight * count : count;

        // term : weight
        CollectionUtils.updateCountMap(this.termWeightMap, termCnt[0], weightedCount);
    }

    for (Map.Entry<String, Double> entry : this.termWeightMap.entrySet()) {
        // term
        this.outputKey.set(entry.getKey());
        // group_id=weighted_tf
        double wtf = entry.getValue() / termsCntInDoc;
        this.outputValue.set(key.toString() + "=" + wtf);
        context.write(this.outputKey, this.outputValue);
    }
}
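
To make the value format concrete, a hypothetical input value such as "title::hadoop=3" would be taken apart by the loop above as follows:

// Hypothetical value in the position::term=count format documented above
String val = "title::hadoop=3";
String[] positionTermCnt = val.split("::");        // ["title", "hadoop=3"]
String[] termCnt = positionTermCnt[1].split("=");  // ["hadoop", "3"]
// count = 3; because the position is "title", the count is scaled by this.weight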

From source file:co.nubetech.hiho.dedup.HashUtility.java

License:Apache License

public static MD5Hash getMD5Hash(IntWritable key) throws IOException {
    return MD5Hash.digest(key.toString());
}
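
A hypothetical call site for the helper above; MD5Hash.digest(String) hashes the UTF-8 bytes of the key's decimal string:

// Hypothetical usage: hash the decimal string of an IntWritable key
MD5Hash hash = HashUtility.getMD5Hash(new IntWritable(7)); // digest of "7"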

From source file:com.digitalpebble.behemoth.mahout.util.ClusterDocIDDumper.java

License:Apache License

public void map(IntWritable key, WeightedVectorWritable value, OutputCollector<Text, Text> output,
        Reporter reporter) throws IOException {
    Vector v = value.getVector();
    if (v instanceof NamedVector) {
        String name = ((NamedVector) v).getName();
        if (name != null && name.length() > 2)
            output.collect(new Text(name), new Text(key.toString()));
        else
            reporter.incrCounter("ClusterDocIDDumper", "Missing name", 1);
    } else
        reporter.incrCounter("ClusterDocIDDumper", "Unnamed vector", 1);
}

From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java

License:Apache License

/**
 * Run the kmeans clustering job on an input dataset using the given
 * number of clusters k and iteration parameters. All output data will be
 * written to the output directory, which will be initially deleted if it
 * exists. The clustered points will reside in the path
 * <output>/clustered-points. By default, the job expects a file containing
 * equal length space delimited data that resides in a directory named
 * "testdata", and writes output to a directory named "output".
 *
 * @param conf the Configuration to use
 * @param input the Path of the input directory
 * @param output the Path of the output directory
 * @param measure the DistanceMeasure to use
 * @param k the number of clusters in Kmeans
 * @param convergenceDelta the double convergence criteria for iterations
 * @param maxIterations the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running random seed to get initial clusters");
    Path clusters = new Path(output, "random-seeds");
    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
    System.out.println("****************************************************************************");

    log.info("Running KMeans with k = {}", k);
    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta,
            maxIterations, true, 0.0, false);
    // run ClusterDumper
    Path outGlob = new Path(output, "clusters-*-final");
    Path clusteredPoints = new Path(output, "clusteredPoints");
    log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
    ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
    clusterDumper.printClusters(null);

    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs,
            new Path("output/" + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"), conf);
    IntWritable key = new IntWritable();
    WeightedVectorWritable value = new WeightedVectorWritable();
    while (reader.next(key, value)) {
        System.out.println(value.toString() + " belongs to cluster " + key.toString());
    }
    reader.close();
}
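
A hedged sketch of how this driver might be invoked, using hypothetical "testdata" and "output" paths matching the defaults named in the javadoc; the distance measure is an assumption:

// Hypothetical invocation of the run() method shown above
Configuration conf = new Configuration();
Job.run(conf, new Path("testdata"), new Path("output"),
        new EuclideanDistanceMeasure(), 5 /* k */, 0.5 /* convergenceDelta */, 10 /* maxIterations */);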

From source file:com.mozilla.grouperfish.transforms.coclustering.pig.storage.MahoutVectorStorage.java

License:Apache License

@Override
public void putNext(Tuple t) throws IOException {
    IntWritable outputKey = new IntWritable();
    VectorWritable outputValue = new VectorWritable();
    outputKey.set((Integer) t.get(0));
    Tuple currRow = (Tuple) t.get(1);
    Vector currRowVector;
    if (dimensions == 0) {
        throw new IllegalArgumentException("Trying to create 0 dimension vector");
    }
    if (STORE_AS_DENSE) {
        currRowVector = new NamedVector(new DenseVector(dimensions), outputKey.toString());
    } else if (STORE_AS_SEQUENTIAL) {
        currRowVector = new NamedVector(new SequentialAccessSparseVector(dimensions, currRow.size()),
                outputKey.toString());
    } else {
        currRowVector = new NamedVector(new RandomAccessSparseVector(dimensions, currRow.size()),
                outputKey.toString());
    }
    for (int ii = 0; ii < currRow.size(); ii++) {
        Object o = currRow.get(ii);
        switch (currRow.getType(ii)) {
        case DataType.INTEGER:
        case DataType.LONG:
        case DataType.FLOAT:
        case DataType.DOUBLE:
            // widen the numeric cell value to a double before setting it
            currRowVector.set(ii, ((Number) o).doubleValue());
            break;
        case DataType.TUPLE:
            // If this is a tuple then we want to set column and element
            Tuple subt = (Tuple) o;
            currRowVector.set((Integer) subt.get(0), (Double) subt.get(1));
            break;
        default:
            throw new RuntimeException("Unexpected tuple form");
        }
    }
    outputValue.set(currRowVector);
    try {
        writer.write(outputKey, outputValue);
    } catch (InterruptedException e) {
        LOG.error("Interrupted while writing", e);
    }
}

From source file:edu.dfci.cccb.mev.kmeans.domain.hadoop.HadoopKMeansBuilder.java

License:Open Source License

@Override
public KMeans build() throws DatasetException {
    try (TemporaryFolder hadoop = new TemporaryFolder()) {
        File points = new File(hadoop, "points");
        points.mkdir();

        Configuration configuration = new Configuration();
        FileSystem system = get(configuration);
        final Dimension other = dataset().dimension(dimension().type() == ROW ? COLUMN : ROW);

        List<NamedVector> vectors = new AbstractList<NamedVector>() {

            @Override
            public NamedVector get(int index) {
                final String vector = dimension().keys().get(index);
                return new NamedVector(new AbstractVector(other.keys().size()) {

                    @Override
                    public void setQuick(int index, double value) {
                        throw new UnsupportedOperationException();
                    }

                    @Override
                    public Vector like() {
                        return new RandomAccessSparseVector(size());
                    }

                    @Override
                    public Iterator<Element> iterator() {
                        return new Iterator<Element>() {
                            private int current = 0;

                            @Override
                            public boolean hasNext() {
                                return current < other.keys().size();
                            }

                            @Override
                            public Element next() {
                                return new Element() {
                                    private final int index = current++;

                                    @Override
                                    public void set(double value) {
                                        throw new UnsupportedOperationException();
                                    }

                                    @Override
                                    public int index() {
                                        return index;
                                    }

                                    @Override
                                    @SneakyThrows(InvalidCoordinateException.class)
                                    public double get() {
                                        return dimension().type() == ROW
                                                ? dataset().values().get(vector, other.keys().get(index))
                                                : dataset().values().get(other.keys().get(index), vector);
                                    }
                                };
                            }

                            @Override
                            public void remove() {
                                throw new UnsupportedOperationException();
                            }
                        };
                    }

                    @Override
                    public Iterator<Element> iterateNonZero() {
                        return iterator();
                    }

                    @Override
                    public boolean isSequentialAccess() {
                        return true;
                    }

                    @Override
                    public boolean isDense() {
                        return true;
                    }

                    @Override
                    @SneakyThrows(InvalidCoordinateException.class)
                    public double getQuick(int index) {
                        return dimension().type() == ROW
                                ? dataset().values().get(vector, other.keys().get(index))
                                : dataset().values().get(other.keys().get(index), vector);
                    }

                    @Override
                    public int getNumNondefaultElements() {
                        return other.keys().size();
                    }

                    @Override
                    protected Matrix matrixLike(int rows, int columns) {
                        throw new UnsupportedOperationException();
                    }
                }, vector);
            }

            @Override
            public int size() {
                return dimension().keys().size();
            }
        };

        // write input
        try (Writer writer = new Writer(system, configuration,
                new Path(new File(points, "file1").getAbsolutePath()), LongWritable.class,
                VectorWritable.class)) {
            VectorWritable writable = new VectorWritable();
            long record = 0;
            for (Vector vector : vectors) {
                writable.set(vector);
                writer.append(new LongWritable(record++), writable);
            }
        }

        // prepare clusters
        File clusters = new File(hadoop, "clusters");
        clusters.mkdir();
        try (Writer writer = new Writer(system, configuration,
                new Path(new File(clusters, "part-00000").getAbsolutePath()), Text.class, Cluster.class)) {
            for (int i = 0; i < k(); i++) {
                Vector vec = vectors.get(i);
                Cluster cluster = new Cluster(vec, i, new EuclideanDistanceMeasure());
                writer.append(new Text(cluster.getIdentifier()), cluster);
            }
        }

        File output = new File(hadoop, "output");
        output.mkdir();

        try {
            run(configuration, new Path(points.getAbsolutePath()), new Path(clusters.getAbsolutePath()),
                    new Path(output.getAbsolutePath()), metric.measurer(), convergence(), iterations(), true,
                    false);

            try (Reader reader = new Reader(system, new Path(
                    new File(new File(output, CLUSTERED_POINTS_DIR), "/part-m-00000").getAbsolutePath()),
                    configuration)) {
                IntWritable key = new IntWritable();
                WeightedVectorWritable value = new WeightedVectorWritable();
                Map<String, Set<String>> result = new HashMap<>();

                while (reader.next(key, value)) {
                    Set<String> cluster = result.get(key.toString());
                    if (cluster == null)
                        result.put(key.toString(), cluster = new HashSet<>());
                    cluster.add(((NamedVector) value.getVector()).getName());
                }

                return new AbstractKMeans() {
                }.dataset(dataset()).dimension(dimension()).name(name()).type(type())
                        .clusters(new HashSet<>(result.values()));
            }
        } catch (ClassNotFoundException | InterruptedException e) {
            throw new DatasetException(e);
        }
    } catch (IOException e) {
        throw new DatasetException(e);
    }
}

From source file:hivemall.ftvec.hashing.ArrayPrefixedHashValuesUDF.java

License:Open Source License

public List<Text> evaluate(List<String> values, String prefix, boolean useIndexAsPrefix) {
    if (values == null) {
        return null;
    }
    if (prefix == null) {
        prefix = "";
    }

    List<IntWritable> hashValues = ArrayHashValuesUDF.hashValues(values, null, MurmurHash3.DEFAULT_NUM_FEATURES,
            useIndexAsPrefix);
    final int len = hashValues.size();
    final Text[] stringValues = new Text[len];
    for (int i = 0; i < len; i++) {
        IntWritable v = hashValues.get(i);
        stringValues[i] = val(prefix + v.toString());
    }
    return Arrays.asList(stringValues);
}

From source file:hk.newsRecommender.MatrixAndCluster.java

License:Open Source License

public static void clusterOutput(Configuration conf, Path path) {
    try {
        BufferedWriter bw;
        FileSystem fs = FileSystem.get(conf);

        SequenceFile.Reader reader = null;
        reader = new SequenceFile.Reader(fs, path, conf);

        // write each point's id and cluster id as "uid \t groupID" (uidOfgrp.txt format)
        bw = new BufferedWriter(new FileWriter(new File("C:\\Users\\Hk\\Desktop\\ClusterPointsInfo.txt")));
        HashMap<String, Integer> clusterIds;
        clusterIds = new HashMap<String, Integer>(120);
        IntWritable key = new IntWritable();
        WeightedPropertyVectorWritable value = new WeightedPropertyVectorWritable();
        //         WeightedVectorWritable value = new WeightedVectorWritable();
        while (reader.next(key, value)) {
            NamedVector vector = (NamedVector) value.getVector();
            // VectorName
            String vectorName = vector.getName();
            System.out.println(vectorName + "\t" + key.toString());
            bw.write(vectorName + "\t" + key.toString() + "\n");
            // count the number of points in each group
            if (clusterIds.containsKey(key.toString())) {
                clusterIds.put(key.toString(), clusterIds.get(key.toString()) + 1);
            } else
                clusterIds.put(key.toString(), 1);
        }
        bw.flush();
        reader.close();
        // write each group's id and size (grpSize)
        bw = new BufferedWriter(new FileWriter(new File("C:\\Users\\Hk\\Desktop\\ClusterPointsSize.txt")));
        Set<String> keys = clusterIds.keySet();
        for (String k : keys) {
            System.out.println(k + " " + clusterIds.get(k));
            bw.write(k + " " + clusterIds.get(k) + "\n");
        }
        bw.flush();
        bw.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:hk.newsRecommender.MatrixAndCluster.java

License:Open Source License

public static void clusterOutput2(Configuration conf, Path path) {
    try {
        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        IntWritable key = new IntWritable();
        WeightedPropertyVectorWritable value = new WeightedPropertyVectorWritable();
        while (reader.next(key, value)) {
            System.out.println(value.toString() + " belongs to cluster " + key.toString());
        }
        reader.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:hr.fer.tel.rovkp.homework02.task02.LocationsReducer.java

@Override
public void reduce(IntWritable key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {

    DebsRecordParser parser = new DebsRecordParser();

    boolean passed = false;

    for (Text value : values) {
        if (!passed) {
            try {
                parser.parse(value.toString());
                passed = true;
            } catch (ParseException ex) {
                passed = false;
            }
        }
        mos.write("bins", NullWritable.get(), value, parser.getLocation() + key.toString() + "/part");
    }
}