List of usage examples for the org.apache.mahout.math.RandomAccessSparseVector#setQuick(int, double) method
@Override public void setQuick(int index, double value)
From source file:com.scaleunlimited.classify.vectors.SetNormalizerTest.java
License:Apache License
@Test public void testNormalization() { BaseNormalizer normalizer = new SetNormalizer(); RandomAccessSparseVector v = new RandomAccessSparseVector(3); v.setQuick(0, 2.0); v.setQuick(1, 8.0);// w w w.j a va 2s. c om v.setQuick(2, 0.0); normalizer.normalize(v); assertEquals(0.5, v.get(0), 0.001); assertEquals(0.5, v.get(1), 0.001); assertEquals(0.0, v.get(2), 0.001); v = new RandomAccessSparseVector(3); v.setQuick(0, 4.0); v.setQuick(1, 0.0); v.setQuick(2, 0.0); normalizer.normalize(v); assertEquals(1.0, v.get(0), 0.001); assertEquals(0.0, v.get(1), 0.001); assertEquals(0.0, v.get(2), 0.001); }
From source file:com.scaleunlimited.classify.vectors.TfNormalizerTest.java
License:Apache License
/**
 * Verifies that {@link TfNormalizer} scales each entry by the sum of all
 * entries (term-frequency normalization), so the results sum to 1.0.
 */
@Test
public void testNormalization() {
    BaseNormalizer normalizer = new TfNormalizer();

    // Entries 2 and 8 (sum 10): expect 0.2 and 0.8 after normalization.
    RandomAccessSparseVector vector = new RandomAccessSparseVector(3);
    vector.setQuick(0, 2.0);
    vector.setQuick(1, 8.0);
    vector.setQuick(2, 0.0);
    normalizer.normalize(vector);
    assertEquals(0.2, vector.get(0), 0.001);
    assertEquals(0.8, vector.get(1), 0.001);
    assertEquals(0.0, vector.get(2), 0.001);

    // A single non-zero entry carries the full weight: expect 1.0.
    vector = new RandomAccessSparseVector(3);
    vector.setQuick(0, 4.0);
    vector.setQuick(1, 0.0);
    vector.setQuick(2, 0.0);
    normalizer.normalize(vector);
    assertEquals(1.0, vector.get(0), 0.001);
    assertEquals(0.0, vector.get(1), 0.001);
    assertEquals(0.0, vector.get(2), 0.001);
}
From source file:com.scaleunlimited.classify.vectors.UnitNormalizerTest.java
License:Apache License
@Test public void testNormalization() { BaseNormalizer normalizer = new UnitNormalizer(); RandomAccessSparseVector v = new RandomAccessSparseVector(3); v.setQuick(0, 2.0); v.setQuick(1, 8.0);//w ww. j av a2 s . co m v.setQuick(2, 0.0); normalizer.normalize(v); assertEquals(1.0, v.getLengthSquared(), 0.001); v = new RandomAccessSparseVector(3); v.setQuick(0, 4.0); v.setQuick(1, 0.0); v.setQuick(2, 0.0); normalizer.normalize(v); assertEquals(1.0, v.getLengthSquared(), 0.001); }
From source file:de.tuberlin.dima.cuttlefish.preprocessing.vectorization.Vectorizer.java
License:Open Source License
/**
 * Converts every document in a Lucene index into a weighted sparse feature
 * vector and writes the results as a Hadoop SequenceFile of
 * ({@code IDAndCodes}, {@code VectorWritable}) pairs in {@code outputDir}.
 *
 * Two passes are made: first, all terms of all non-"bip:"/non-"itemID"
 * fields are registered in a {@code FeatureDictionary}; second, each
 * document's per-term weights are computed via {@code weighting.weight(...)}
 * and stored at the dictionary's feature index. Fields whose names start
 * with "bip:" are collected as codes rather than vectorized. The dictionary
 * itself is written to {@code outputDir/features.txt}.
 *
 * @param luceneIndexDir directory containing the Lucene index to read
 * @param outputDir      destination directory for documentVectors.seq and features.txt
 * @throws Exception on any index-read or file-write failure
 */
public void vectorize(File luceneIndexDir, File outputDir) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    SequenceFile.Writer writer = null;
    FeatureDictionary dict = new FeatureDictionary();
    DirectoryReader reader = null;
    try {
        reader = DirectoryReader.open(new SimpleFSDirectory(luceneIndexDir));
        writer = SequenceFile.createWriter(fs, conf, new Path(outputDir.toString(), "documentVectors.seq"),
                IDAndCodes.class, VectorWritable.class);
        IDAndCodes idAndCodes = new IDAndCodes();
        VectorWritable vectorWritable = new VectorWritable();
        // Pass 1: build the feature dictionary from every eligible field's terms.
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Iterator<String> fieldNames = fields.iterator();
            while (fieldNames.hasNext()) {
                String field = fieldNames.next();
                // "bip:" fields and the itemID field are metadata, not text features.
                if (!field.startsWith("bip:") && !"itemID".equals(field)) {
                    Terms terms = fields.terms(field);
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        dict.addTextFeature(field, text.utf8ToString());
                    }
                }
            }
        }
        // Pass 2: vectorize each document using the completed dictionary.
        int numDocsVectorized = 0;
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            Document doc = reader.document(docID);
            int itemID = doc.getField("itemID").numericValue().intValue();
            RandomAccessSparseVector documentVector = new RandomAccessSparseVector(dict.numFeatures());
            Multimap<String, String> codes = HashMultimap.create();
            for (IndexableField field : doc.getFields()) {
                String fieldName = field.name();
                if (!fieldName.startsWith("bip:") && !"itemID".equals(fieldName)) {
                    // Requires term vectors to have been stored at indexing time;
                    // fields without one are silently skipped.
                    Terms termFreqVector = reader.getTermVector(docID, fieldName);
                    if (termFreqVector != null) {
                        int maxTermFrequency = maxTermFrequency(termFreqVector);
                        TermsEnum te = termFreqVector.iterator(null);
                        BytesRef term;
                        while ((term = te.next()) != null) {
                            String termStr = term.utf8ToString();
                            // totalTermFreq on a single-doc term vector is the
                            // in-document frequency; truncated to int here.
                            int termFrequency = (int) te.totalTermFreq();
                            int documentFrequency = reader.docFreq(new Term(fieldName, term));
                            int numDocs = reader.numDocs();
                            double weight = weighting.weight(fieldName, termStr, termFrequency,
                                    documentFrequency, maxTermFrequency, numDocs);
                            int featureIndex = dict.index(fieldName, term.utf8ToString());
                            documentVector.setQuick(featureIndex, weight);
                        }
                    }
                } else if (fieldName.startsWith("bip:")) {
                    // "bip:" fields carry categorical codes attached to the item.
                    for (String value : doc.getValues(fieldName)) {
                        codes.put(fieldName, value);
                    }
                }
            }
            // Convert to sequential layout for compact serialization, then normalize.
            Vector featureVector = new SequentialAccessSparseVector(documentVector);
            weighting.normalize(featureVector);
            idAndCodes.set(itemID, codes);
            vectorWritable.set(featureVector);
            writer.append(idAndCodes, vectorWritable);
            numDocsVectorized++;
            if (numDocsVectorized % 100 == 0) {
                log.info("Vectorized {} documents", numDocsVectorized);
            }
        }
        log.info("Vectorized {} documents", numDocsVectorized);
        dict.writeToFile(new File(outputDir, "features.txt"));
        log.info("Wrote feature dictionary");
    } finally {
        // Closeables.close(..., true) swallows close-time IOExceptions.
        Closeables.close(reader, true);
        Closeables.close(writer, true);
    }
}
From source file:org.swjtu.helloworldcn.APCMatrixInputReducer.java
License:Apache License
@Override protected void reduce(IntWritable row, Iterable<APCMatrixEntryWritable> values, Context context) throws IOException, InterruptedException { int size = context.getConfiguration().getInt(APCMatrixInputJob.MATRIX_DIMENSIONS, Integer.MAX_VALUE); RandomAccessSparseVector outS = new RandomAccessSparseVector(size, 100); RandomAccessSparseVector outA = new RandomAccessSparseVector(size, 100); RandomAccessSparseVector outR = new RandomAccessSparseVector(size, 100); Configuration conf = context.getConfiguration(); if (preference == null) { preference = Double.parseDouble(conf.get(APCMatrixInputJob.TEMPORARY_SAVE_PREFERENCE)); }/*from w w w .j a v a2 s. c om*/ //System.out.println("pian du"+preference); for (APCMatrixEntryWritable element : values) { outS.setQuick(element.getCol(), element.getValS()); outA.setQuick(element.getCol(), 0.0); outR.setQuick(element.getCol(), 0.0); } //Place preferences on the diagonal of S outS.setQuick(row.get(), preference); SequentialAccessSparseVector outputS = new SequentialAccessSparseVector(outS); SequentialAccessSparseVector outputA = new SequentialAccessSparseVector(outA); SequentialAccessSparseVector outputR = new SequentialAccessSparseVector(outR); APCRowVectorWritable rowVectorWritable = new APCRowVectorWritable(outputA, outputR, outputS); //System.out.println(outputS); context.write(row, rowVectorWritable); }
From source file:root.input.lyrl2004.FormatVectorsJob.java
License:Apache License
/** * {@inheritDoc}/*from w w w. j a v a 2 s. com*/ */ @Override public int run(String[] args) throws Exception { constructParameterList(); if (parseArguments(args) == null) { return -1; } initializeConfigurationParameters(); printJobHeader(); Configuration conf = getConf(); URI workingURI = new URI(conf.get("fs.default.name")); URI inputURI = new URI(inputDirectory); FileSystem workingFS = FileSystem.get(workingURI, conf); FileSystem inputFS = FileSystem.get(inputURI, conf); Path in = new Path(inputDirectory); Path docIdFile = new Path(fileDictDirectory + "/vectorName2docId"); Path vectorFile = new Path(vectorDirectory + "/part-r-00000"); @SuppressWarnings("resource") SequenceFile.Writer metadataWriter = new SequenceFile.Writer(workingFS, conf, docIdFile, Text.class, Text.class); @SuppressWarnings("resource") SequenceFile.Writer vectorWriter = new SequenceFile.Writer(workingFS, conf, vectorFile, Text.class, VectorWritable.class); FileStatus[] files = inputFS.listStatus(in); int counter = 0; for (FileStatus f : files) { Path curr = f.getPath(); if (curr.getName().startsWith(".")) { throw new Exception("Bad Data: Hidden Files Exist"); } Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(inputFS.open(curr)))); while (sc.hasNext()) { String key = sc.next(); RandomAccessSparseVector vector = new RandomAccessSparseVector(10000); String line = sc.nextLine().trim(); Scanner lineScanner = new Scanner(line); while (lineScanner.hasNext()) { String pair = lineScanner.next(); int k = Integer.valueOf(pair.split(":")[0]); double v = Double.valueOf(pair.split(":")[1]); vector.setQuick(k, v); } String nextName = counter + ""; String nextFileName = "/" + counter; counter++; VectorWritable vec = new VectorWritable(); vec.set(vector); vectorWriter.append(new Text(nextFileName), vec); metadataWriter.append(new Text(key), new Text(nextName)); lineScanner.close(); } sc.close(); } metadataWriter.close(); vectorWriter.close(); return 0; }
From source file:root.input.points.FormatVectorsJob.java
License:Apache License
/**
 * This method allows the Job to act as a {@link ToolRunner} and interface
 * properly with the Driver. It reads comma-separated "x,y,cluster" point
 * files under {@code inputDirectory}, writes each point as a sparse 2-D
 * vector into a SequenceFile, and records a "{x:..,y:..,cluster:..}" label
 * for each vector in a metadata SequenceFile.
 *
 * Fix over the original: the per-file {@link Scanner} and the two
 * {@code SequenceFile.Writer}s were leaked on any exception path (hidden
 * behind {@code @SuppressWarnings("resource")}); all are now closed in
 * {@code finally} blocks.
 *
 * @param args Configuration arguments
 * @return Exit status (0 on success, -1 when argument parsing fails)
 * @throws Exception if a hidden file is found in the input, or on I/O failure
 * @see ToolRunner
 */
@Override
public int run(String[] args) throws Exception {
    addArguments();
    if (parseArguments(args) == null) {
        return -1;
    }
    initArguments();

    Configuration conf = getConf();
    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);
    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path in = new Path(inputDirectory);
    Path docIdFile = new Path(fileDictDirectory + "/vectorName2docId");
    Path vectorFile = new Path(vectorDirectory + "/part-r-00000");

    SequenceFile.Writer metadataWriter = null;
    SequenceFile.Writer vectorWriter = null;
    try {
        metadataWriter = new SequenceFile.Writer(workingFS, conf, docIdFile, Text.class, Text.class);
        vectorWriter = new SequenceFile.Writer(workingFS, conf, vectorFile, Text.class, VectorWritable.class);

        FileStatus[] files = inputFS.listStatus(in);
        int counter = 0;
        for (FileStatus f : files) {
            Path curr = f.getPath();
            if (curr.getName().startsWith(".")) {
                throw new Exception("Bad Data: Hidden Files Exist");
            }
            Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(inputFS.open(curr))));
            try {
                while (sc.hasNext()) {
                    // Each line is: x,y,cluster
                    String line = sc.nextLine();
                    // NOTE(review): cardinality 10000 is kept from the original even
                    // though only indices 0 and 1 are used; shrinking it would change
                    // the serialized vector's dimensionality for downstream readers.
                    RandomAccessSparseVector vector = new RandomAccessSparseVector(10000);
                    String[] split = line.split(",");
                    double val1 = Double.valueOf(split[0]);
                    double val2 = Double.valueOf(split[1]);
                    int val3 = Integer.valueOf(split[2]);
                    vector.setQuick(0, val1);
                    vector.setQuick(1, val2);
                    // Vectors are named by a running counter; the cluster label only
                    // goes into the metadata record, not the vector itself.
                    String nextName = counter + "";
                    String nextFileName = "/" + counter;
                    counter++;
                    VectorWritable vec = new VectorWritable();
                    vec.set(vector);
                    vectorWriter.append(new Text(nextFileName), vec);
                    String point = "{x:" + val1 + ",y:" + val2 + ",cluster:" + val3 + "}";
                    metadataWriter.append(new Text(point), new Text(nextName));
                }
            } finally {
                sc.close();
            }
        }
    } finally {
        // Close both writers even if one close fails.
        try {
            if (metadataWriter != null) {
                metadataWriter.close();
            }
        } finally {
            if (vectorWriter != null) {
                vectorWriter.close();
            }
        }
    }
    return 0;
}