List of usage examples for org.apache.hadoop.io.IntWritable.toString()
@Override
public String toString()
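All of the examples below rely on the same behavior: IntWritable.toString() returns the decimal string form of the wrapped int, which makes it convenient for building text keys, hash inputs, and output paths. A minimal standalone sketch of that behavior (hypothetical, not taken from any of the source files listed):

import org.apache.hadoop.io.IntWritable;

public class IntWritableToStringDemo {
    public static void main(String[] args) {
        IntWritable clusterId = new IntWritable(42);
        // toString() returns the decimal representation of the wrapped int: "42"
        String s = clusterId.toString();
        System.out.println("cluster-" + s); // prints: cluster-42
    }
}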
From source file:clustering.tf_idf.TermFreqReducer.java
License:Apache License
/**
 * @param key    group_id
 * @param values position::term=count
 * {@inheritDoc}
 */
@Override
protected void reduce(IntWritable key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    int termsCntInDoc = 0;
    this.termWeightMap.clear();
    for (Text val : values) {
        // positionTermCnt[0] = position
        // positionTermCnt[1] = term=count
        String[] positionTermCnt = val.toString().split("::");
        String position = positionTermCnt[0];
        String[] termCnt = positionTermCnt[1].split("=");
        int count = Integer.valueOf(termCnt[1]);
        termsCntInDoc += count;
        // TODO: 17-4-24 is it necessary to make it enum or a class?
        double weightedCount = position.equals("title") ? this.weight * count : count;
        // term : weight
        CollectionUtils.updateCountMap(this.termWeightMap, termCnt[0], weightedCount);
    }
    for (Map.Entry<String, Double> entry : this.termWeightMap.entrySet()) {
        // term
        this.outputKey.set(entry.getKey());
        // group_id=weighted_tf
        double wtf = entry.getValue() / termsCntInDoc;
        this.outputValue.set(key.toString() + "=" + wtf);
        context.write(this.outputKey, this.outputValue);
    }
}
From source file:co.nubetech.hiho.dedup.HashUtility.java
License:Apache License
public static MD5Hash getMD5Hash(IntWritable key) throws IOException {
    return MD5Hash.digest(key.toString());
}
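Note that this digests the decimal string form of the key ("7" for IntWritable(7)), not the serialized 4-byte int. A hypothetical call site, assuming HashUtility from the example above is on the classpath:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MD5Hash;

public class HashUtilityDemo {
    public static void main(String[] args) throws IOException {
        // hashes the text "7" via MD5Hash.digest(String), not the raw int bytes
        MD5Hash hash = HashUtility.getMD5Hash(new IntWritable(7));
        System.out.println(hash);
    }
}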
From source file:com.digitalpebble.behemoth.mahout.util.ClusterDocIDDumper.java
License:Apache License
public void map(IntWritable key, WeightedVectorWritable value, OutputCollector<Text, Text> output,
        Reporter reporter) throws IOException {
    Vector v = value.getVector();
    if (v instanceof NamedVector) {
        String name = ((NamedVector) v).getName();
        // use && (not &) so name.length() is not evaluated when name is null
        if (name != null && name.length() > 2)
            output.collect(new Text(name), new Text(key.toString()));
        else
            reporter.incrCounter("ClusterDocIDDumper", "Missing name", 1);
    } else
        reporter.incrCounter("ClusterDocIDDumper", "Unnamed vector", 1);
}
From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java
License:Apache License
/**
 * Run the kmeans clustering job on an input dataset using the given number of
 * clusters k and iteration parameters. All output data will be written to the
 * output directory, which will be deleted first if it exists. The clustered
 * points will reside in the path <output>/clustered-points. By default, the
 * job expects a file containing equal-length, space-delimited data that
 * resides in a directory named "testdata", and writes output to a directory
 * named "output".
 *
 * @param conf             the Configuration to use
 * @param input            the String denoting the input directory path
 * @param output           the String denoting the output directory path
 * @param measure          the DistanceMeasure to use
 * @param k                the number of clusters in Kmeans
 * @param convergenceDelta the double convergence criteria for iterations
 * @param maxIterations    the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running random seed to get initial clusters");
    Path clusters = new Path(output, "random-seeds");
    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
    System.out.println("****************************************************************************");
    log.info("Running KMeans with k = {}", k);
    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta,
            maxIterations, true, 0.0, false);
    // run ClusterDumper
    Path outGlob = new Path(output, "clusters-*-final");
    Path clusteredPoints = new Path(output, "clusteredPoints");
    log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
    ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
    clusterDumper.printClusters(null);
    FileSystem fs = FileSystem.get(conf);
    // NB: reads from the literal "output" directory rather than the output parameter
    SequenceFile.Reader reader = new SequenceFile.Reader(fs,
            new Path("output/" + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"), conf);
    IntWritable key = new IntWritable();
    WeightedVectorWritable value = new WeightedVectorWritable();
    while (reader.next(key, value)) {
        System.out.println(value.toString() + " belongs to cluster " + key.toString());
    }
    reader.close();
}
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.storage.MahoutVectorStorage.java
License:Apache License
@Override
public void putNext(Tuple t) throws IOException {
    IntWritable outputKey = new IntWritable();
    VectorWritable outputValue = new VectorWritable();
    outputKey.set((Integer) t.get(0));
    Tuple currRow = (Tuple) t.get(1);
    Vector currRowVector;
    if (dimensions == 0) {
        throw new IllegalArgumentException("Trying to create 0 dimension vector");
    }
    if (STORE_AS_DENSE) {
        currRowVector = new NamedVector(new DenseVector(dimensions), outputKey.toString());
    } else if (STORE_AS_SEQUENTIAL) {
        currRowVector = new NamedVector(new SequentialAccessSparseVector(dimensions, currRow.size()),
                outputKey.toString());
    } else {
        currRowVector = new NamedVector(new RandomAccessSparseVector(dimensions, currRow.size()),
                outputKey.toString());
    }
    for (int ii = 0; ii < currRow.size(); ii++) {
        Object o = currRow.get(ii);
        switch (currRow.getType(ii)) {
        case DataType.INTEGER:
        case DataType.LONG:
        case DataType.FLOAT:
        case DataType.DOUBLE:
            // go through Number so integer and float values do not throw ClassCastException
            currRowVector.set(ii, ((Number) o).doubleValue());
            break;
        case DataType.TUPLE:
            // If this is a tuple then we want to set column and element
            Tuple subt = (Tuple) o;
            currRowVector.set((Integer) subt.get(0), (Double) subt.get(1));
            break;
        default:
            throw new RuntimeException("Unexpected tuple form");
        }
    }
    outputValue.set(currRowVector);
    try {
        writer.write(outputKey, outputValue);
    } catch (InterruptedException e) {
        LOG.error("Interrupted while writing", e);
    }
}
From source file:edu.dfci.cccb.mev.kmeans.domain.hadoop.HadoopKMeansBuilder.java
License:Open Source License
@Override
public KMeans build() throws DatasetException {
    try (TemporaryFolder hadoop = new TemporaryFolder()) {
        File points = new File(hadoop, "points");
        points.mkdir();
        Configuration configuration = new Configuration();
        FileSystem system = get(configuration);
        final Dimension other = dataset().dimension(dimension().type() == ROW ? COLUMN : ROW);
        List<NamedVector> vectors = new AbstractList<NamedVector>() {
            @Override
            public NamedVector get(int index) {
                final String vector = dimension().keys().get(index);
                return new NamedVector(new AbstractVector(other.keys().size()) {
                    @Override public void setQuick(int index, double value) { throw new UnsupportedOperationException(); }
                    @Override public Vector like() { return new RandomAccessSparseVector(size()); }
                    @Override public Iterator<Element> iterator() {
                        return new Iterator<Element>() {
                            private int current = 0;
                            @Override public boolean hasNext() { return current < other.keys().size(); }
                            @Override public Element next() {
                                return new Element() {
                                    private final int index = current++;
                                    @Override public void set(double value) { throw new UnsupportedOperationException(); }
                                    @Override public int index() { return index; }
                                    @Override
                                    @SneakyThrows(InvalidCoordinateException.class)
                                    public double get() {
                                        return dimension().type() == ROW
                                                ? dataset().values().get(vector, other.keys().get(index))
                                                : dataset().values().get(other.keys().get(index), vector);
                                    }
                                };
                            }
                            @Override public void remove() { throw new UnsupportedOperationException(); }
                        };
                    }
                    @Override public Iterator<Element> iterateNonZero() { return iterator(); }
                    @Override public boolean isSequentialAccess() { return true; }
                    @Override public boolean isDense() { return true; }
                    @Override
                    @SneakyThrows(InvalidCoordinateException.class)
                    public double getQuick(int index) {
                        return dimension().type() == ROW
                                ? dataset().values().get(vector, other.keys().get(index))
                                : dataset().values().get(other.keys().get(index), vector);
                    }
                    @Override public int getNumNondefaultElements() { return other.keys().size(); }
                    @Override protected Matrix matrixLike(int rows, int columns) { throw new UnsupportedOperationException(); }
                }, vector);
            }
            @Override
            public int size() { return dimension().keys().size(); }
        };
        // write input
        try (Writer writer = new Writer(system, configuration,
                new Path(new File(points, "file1").getAbsolutePath()), LongWritable.class, VectorWritable.class)) {
            VectorWritable writable = new VectorWritable();
            long record = 0;
            for (Vector vector : vectors) {
                writable.set(vector);
                writer.append(new LongWritable(record++), writable);
            }
        }
        // prepare clusters
        File clusters = new File(hadoop, "clusters");
        clusters.mkdir();
        try (Writer writer = new Writer(system, configuration,
                new Path(new File(clusters, "part-00000").getAbsolutePath()), Text.class, Cluster.class)) {
            for (int i = 0; i < k(); i++) {
                Vector vec = vectors.get(i);
                Cluster cluster = new Cluster(vec, i, new EuclideanDistanceMeasure());
                writer.append(new Text(cluster.getIdentifier()), cluster);
            }
        }
        File output = new File(hadoop, "output");
        output.mkdir();
        try {
            run(configuration, new Path(points.getAbsolutePath()), new Path(clusters.getAbsolutePath()),
                    new Path(output.getAbsolutePath()), metric.measurer(), convergence(), iterations(), true, false);
            try (Reader reader = new Reader(system,
                    new Path(new File(new File(output, CLUSTERED_POINTS_DIR), "/part-m-00000").getAbsolutePath()),
                    configuration)) {
                IntWritable key = new IntWritable();
                WeightedVectorWritable value = new WeightedVectorWritable();
                Map<String, Set<String>> result = new HashMap<>();
                while (reader.next(key, value)) {
                    Set<String> cluster = result.get(key.toString());
                    if (cluster == null)
                        result.put(key.toString(), cluster = new HashSet<>());
                    cluster.add(((NamedVector) value.getVector()).getName());
                }
                return new AbstractKMeans() {
                }.dataset(dataset()).dimension(dimension()).name(name()).type(type())
                        .clusters(new HashSet<>(result.values()));
            }
        } catch (ClassNotFoundException | InterruptedException e) {
            throw new DatasetException(e);
        }
    } catch (IOException e) {
        throw new DatasetException(e);
    }
}
From source file:hivemall.ftvec.hashing.ArrayPrefixedHashValuesUDF.java
License:Open Source License
public List<Text> evaluate(List<String> values, String prefix, boolean useIndexAsPrefix) {
    if (values == null) {
        return null;
    }
    if (prefix == null) {
        prefix = "";
    }
    List<IntWritable> hashValues = ArrayHashValuesUDF.hashValues(values, null,
            MurmurHash3.DEFAULT_NUM_FEATURES, useIndexAsPrefix);
    final int len = hashValues.size();
    final Text[] stringValues = new Text[len];
    for (int i = 0; i < len; i++) {
        IntWritable v = hashValues.get(i);
        stringValues[i] = val(prefix + v.toString());
    }
    return Arrays.asList(stringValues);
}
From source file:hk.newsRecommender.MatrixAndCluster.java
License:Open Source License
public static void clusterOutput(Configuration conf, Path path) {
    try {
        BufferedWriter bw;
        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Reader reader = null;
        reader = new SequenceFile.Reader(fs, path, conf);
        // write each point as a "uid \t groupID" line to a local file
        bw = new BufferedWriter(new FileWriter(new File("C:\\Users\\Hk\\Desktop\\ClusterPointsInfo.txt")));
        HashMap<String, Integer> clusterIds;
        clusterIds = new HashMap<String, Integer>(120);
        IntWritable key = new IntWritable();
        WeightedPropertyVectorWritable value = new WeightedPropertyVectorWritable();
        // WeightedVectorWritable value = new WeightedVectorWritable();
        while (reader.next(key, value)) {
            NamedVector vector = (NamedVector) value.getVector();
            // the vector's name holds the uid; the key is the cluster (group) id
            String vectorName = vector.getName();
            System.out.println(vectorName + "\t" + key.toString());
            bw.write(vectorName + "\t" + key.toString() + "\n");
            // count how many points fall into each cluster
            if (clusterIds.containsKey(key.toString())) {
                clusterIds.put(key.toString(), clusterIds.get(key.toString()) + 1);
            } else
                clusterIds.put(key.toString(), 1);
        }
        bw.flush();
        bw.close();
        reader.close();
        // write each cluster id and its size to a second local file
        bw = new BufferedWriter(new FileWriter(new File("C:\\Users\\Hk\\Desktop\\ClusterPointsSize.txt")));
        Set<String> keys = clusterIds.keySet();
        for (String k : keys) {
            System.out.println(k + " " + clusterIds.get(k));
            bw.write(k + " " + clusterIds.get(k) + "\n");
        }
        bw.flush();
        bw.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:hk.newsRecommender.MatrixAndCluster.java
License:Open Source License
public static void clusterOutput2(Configuration conf, Path path) {
    try {
        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        IntWritable key = new IntWritable();
        WeightedPropertyVectorWritable value = new WeightedPropertyVectorWritable();
        while (reader.next(key, value)) {
            System.out.println(value.toString() + " belongs to cluster " + key.toString());
        }
        reader.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:hr.fer.tel.rovkp.homework02.task02.LocationsReducer.java
@Override
public void reduce(IntWritable key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    DebsRecordParser parser = new DebsRecordParser();
    boolean passed = false;
    for (Text value : values) {
        if (!passed) {
            try {
                parser.parse(value.toString());
                passed = true;
            } catch (ParseException ex) {
                passed = false;
            }
        }
        // bin path is derived from the first successfully parsed record's location plus the key
        mos.write("bins", NullWritable.get(), value, parser.getLocation() + key.toString() + "/part");
    }
}