List of usage examples for the org.apache.mahout.math.RandomAccessSparseVector#setQuick(int, double) method
@Override public void setQuick(int index, double value)
From source file:com.scaleunlimited.classify.vectors.SetNormalizerTest.java
License:Apache License
@Test public void testNormalization() { BaseNormalizer normalizer = new SetNormalizer(); RandomAccessSparseVector v = new RandomAccessSparseVector(3); v.setQuick(0, 2.0); v.setQuick(1, 8.0);// w w w.j a va 2s. c om v.setQuick(2, 0.0); normalizer.normalize(v); assertEquals(0.5, v.get(0), 0.001); assertEquals(0.5, v.get(1), 0.001); assertEquals(0.0, v.get(2), 0.001); v = new RandomAccessSparseVector(3); v.setQuick(0, 4.0); v.setQuick(1, 0.0); v.setQuick(2, 0.0); normalizer.normalize(v); assertEquals(1.0, v.get(0), 0.001); assertEquals(0.0, v.get(1), 0.001); assertEquals(0.0, v.get(2), 0.001); }
From source file:com.scaleunlimited.classify.vectors.TfNormalizerTest.java
License:Apache License
/**
 * Verifies that {@link TfNormalizer} scales each entry by the sum of all
 * entries (term-frequency normalization), so the results sum to 1.0.
 */
@Test
public void testNormalization() {
    BaseNormalizer normalizer = new TfNormalizer();

    // Entries 2 and 8 (sum 10): expect 0.2 and 0.8 after normalization.
    RandomAccessSparseVector vector = new RandomAccessSparseVector(3);
    vector.setQuick(0, 2.0);
    vector.setQuick(1, 8.0);
    vector.setQuick(2, 0.0);
    normalizer.normalize(vector);
    assertEquals(0.2, vector.get(0), 0.001);
    assertEquals(0.8, vector.get(1), 0.001);
    assertEquals(0.0, vector.get(2), 0.001);

    // A single non-zero entry carries the full weight: expect 1.0.
    vector = new RandomAccessSparseVector(3);
    vector.setQuick(0, 4.0);
    vector.setQuick(1, 0.0);
    vector.setQuick(2, 0.0);
    normalizer.normalize(vector);
    assertEquals(1.0, vector.get(0), 0.001);
    assertEquals(0.0, vector.get(1), 0.001);
    assertEquals(0.0, vector.get(2), 0.001);
}
From source file:com.scaleunlimited.classify.vectors.UnitNormalizerTest.java
License:Apache License
@Test public void testNormalization() { BaseNormalizer normalizer = new UnitNormalizer(); RandomAccessSparseVector v = new RandomAccessSparseVector(3); v.setQuick(0, 2.0); v.setQuick(1, 8.0);//w ww. j av a2 s . co m v.setQuick(2, 0.0); normalizer.normalize(v); assertEquals(1.0, v.getLengthSquared(), 0.001); v = new RandomAccessSparseVector(3); v.setQuick(0, 4.0); v.setQuick(1, 0.0); v.setQuick(2, 0.0); normalizer.normalize(v); assertEquals(1.0, v.getLengthSquared(), 0.001); }
From source file:de.tuberlin.dima.cuttlefish.preprocessing.vectorization.Vectorizer.java
License:Open Source License
/**
 * Converts every document in a Lucene index into a weighted sparse feature
 * vector and writes the results as a Hadoop SequenceFile of
 * ({@code IDAndCodes}, {@code VectorWritable}) pairs in {@code outputDir}.
 *
 * Two passes are made: first, all terms of all non-"bip:"/non-"itemID"
 * fields are registered in a {@code FeatureDictionary}; second, each
 * document's per-term weights are computed via {@code weighting.weight(...)}
 * and stored at the dictionary's feature index. Fields whose names start
 * with "bip:" are collected as codes rather than vectorized. The dictionary
 * itself is written to {@code outputDir/features.txt}.
 *
 * @param luceneIndexDir directory containing the Lucene index to read
 * @param outputDir      destination directory for documentVectors.seq and features.txt
 * @throws Exception on any index-read or file-write failure
 */
public void vectorize(File luceneIndexDir, File outputDir) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    SequenceFile.Writer writer = null;
    FeatureDictionary dict = new FeatureDictionary();
    DirectoryReader reader = null;
    try {
        reader = DirectoryReader.open(new SimpleFSDirectory(luceneIndexDir));
        writer = SequenceFile.createWriter(fs, conf, new Path(outputDir.toString(), "documentVectors.seq"),
                IDAndCodes.class, VectorWritable.class);
        IDAndCodes idAndCodes = new IDAndCodes();
        VectorWritable vectorWritable = new VectorWritable();
        // Pass 1: build the feature dictionary from every eligible field's terms.
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Iterator<String> fieldNames = fields.iterator();
            while (fieldNames.hasNext()) {
                String field = fieldNames.next();
                // "bip:" fields and the itemID field are metadata, not text features.
                if (!field.startsWith("bip:") && !"itemID".equals(field)) {
                    Terms terms = fields.terms(field);
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        dict.addTextFeature(field, text.utf8ToString());
                    }
                }
            }
        }
        // Pass 2: vectorize each document using the completed dictionary.
        int numDocsVectorized = 0;
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            Document doc = reader.document(docID);
            int itemID = doc.getField("itemID").numericValue().intValue();
            RandomAccessSparseVector documentVector = new RandomAccessSparseVector(dict.numFeatures());
            Multimap<String, String> codes = HashMultimap.create();
            for (IndexableField field : doc.getFields()) {
                String fieldName = field.name();
                if (!fieldName.startsWith("bip:") && !"itemID".equals(fieldName)) {
                    // Requires term vectors to have been stored at indexing time;
                    // fields without one are silently skipped.
                    Terms termFreqVector = reader.getTermVector(docID, fieldName);
                    if (termFreqVector != null) {
                        int maxTermFrequency = maxTermFrequency(termFreqVector);
                        TermsEnum te = termFreqVector.iterator(null);
                        BytesRef term;
                        while ((term = te.next()) != null) {
                            String termStr = term.utf8ToString();
                            // totalTermFreq on a single-doc term vector is the
                            // in-document frequency; truncated to int here.
                            int termFrequency = (int) te.totalTermFreq();
                            int documentFrequency = reader.docFreq(new Term(fieldName, term));
                            int numDocs = reader.numDocs();
                            double weight = weighting.weight(fieldName, termStr, termFrequency,
                                    documentFrequency, maxTermFrequency, numDocs);
                            int featureIndex = dict.index(fieldName, term.utf8ToString());
                            documentVector.setQuick(featureIndex, weight);
                        }
                    }
                } else if (fieldName.startsWith("bip:")) {
                    // "bip:" fields carry categorical codes attached to the item.
                    for (String value : doc.getValues(fieldName)) {
                        codes.put(fieldName, value);
                    }
                }
            }
            // Convert to sequential layout for compact serialization, then normalize.
            Vector featureVector = new SequentialAccessSparseVector(documentVector);
            weighting.normalize(featureVector);
            idAndCodes.set(itemID, codes);
            vectorWritable.set(featureVector);
            writer.append(idAndCodes, vectorWritable);
            numDocsVectorized++;
            if (numDocsVectorized % 100 == 0) {
                log.info("Vectorized {} documents", numDocsVectorized);
            }
        }
        log.info("Vectorized {} documents", numDocsVectorized);
        dict.writeToFile(new File(outputDir, "features.txt"));
        log.info("Wrote feature dictionary");
    } finally {
        // Closeables.close(..., true) swallows close-time IOExceptions.
        Closeables.close(reader, true);
        Closeables.close(writer, true);
    }
}
From source file:org.swjtu.helloworldcn.APCMatrixInputReducer.java
License:Apache License
@Override protected void reduce(IntWritable row, Iterable<APCMatrixEntryWritable> values, Context context) throws IOException, InterruptedException { int size = context.getConfiguration().getInt(APCMatrixInputJob.MATRIX_DIMENSIONS, Integer.MAX_VALUE); RandomAccessSparseVector outS = new RandomAccessSparseVector(size, 100); RandomAccessSparseVector outA = new RandomAccessSparseVector(size, 100); RandomAccessSparseVector outR = new RandomAccessSparseVector(size, 100); Configuration conf = context.getConfiguration(); if (preference == null) { preference = Double.parseDouble(conf.get(APCMatrixInputJob.TEMPORARY_SAVE_PREFERENCE)); }/*from w w w .j a v a2 s. c om*/ //System.out.println("pian du"+preference); for (APCMatrixEntryWritable element : values) { outS.setQuick(element.getCol(), element.getValS()); outA.setQuick(element.getCol(), 0.0); outR.setQuick(element.getCol(), 0.0); } //Place preferences on the diagonal of S outS.setQuick(row.get(), preference); SequentialAccessSparseVector outputS = new SequentialAccessSparseVector(outS); SequentialAccessSparseVector outputA = new SequentialAccessSparseVector(outA); SequentialAccessSparseVector outputR = new SequentialAccessSparseVector(outR); APCRowVectorWritable rowVectorWritable = new APCRowVectorWritable(outputA, outputR, outputS); //System.out.println(outputS); context.write(row, rowVectorWritable); }
From source file:root.input.lyrl2004.FormatVectorsJob.java
License:Apache License
/** * {@inheritDoc}/*from w w w. j a v a 2 s. com*/ */ @Override public int run(String[] args) throws Exception { constructParameterList(); if (parseArguments(args) == null) { return -1; } initializeConfigurationParameters(); printJobHeader(); Configuration conf = getConf(); URI workingURI = new URI(conf.get("fs.default.name")); URI inputURI = new URI(inputDirectory); FileSystem workingFS = FileSystem.get(workingURI, conf); FileSystem inputFS = FileSystem.get(inputURI, conf); Path in = new Path(inputDirectory); Path docIdFile = new Path(fileDictDirectory + "/vectorName2docId"); Path vectorFile = new Path(vectorDirectory + "/part-r-00000"); @SuppressWarnings("resource") SequenceFile.Writer metadataWriter = new SequenceFile.Writer(workingFS, conf, docIdFile, Text.class, Text.class); @SuppressWarnings("resource") SequenceFile.Writer vectorWriter = new SequenceFile.Writer(workingFS, conf, vectorFile, Text.class, VectorWritable.class); FileStatus[] files = inputFS.listStatus(in); int counter = 0; for (FileStatus f : files) { Path curr = f.getPath(); if (curr.getName().startsWith(".")) { throw new Exception("Bad Data: Hidden Files Exist"); } Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(inputFS.open(curr)))); while (sc.hasNext()) { String key = sc.next(); RandomAccessSparseVector vector = new RandomAccessSparseVector(10000); String line = sc.nextLine().trim(); Scanner lineScanner = new Scanner(line); while (lineScanner.hasNext()) { String pair = lineScanner.next(); int k = Integer.valueOf(pair.split(":")[0]); double v = Double.valueOf(pair.split(":")[1]); vector.setQuick(k, v); } String nextName = counter + ""; String nextFileName = "/" + counter; counter++; VectorWritable vec = new VectorWritable(); vec.set(vector); vectorWriter.append(new Text(nextFileName), vec); metadataWriter.append(new Text(key), new Text(nextName)); lineScanner.close(); } sc.close(); } metadataWriter.close(); vectorWriter.close(); return 0; }
From source file:root.input.points.FormatVectorsJob.java
License:Apache License
/**
 * This method allows the Job to act as a {@link ToolRunner} and interface
 * properly with the Driver. It reads comma-separated "x,y,cluster" point
 * files under {@code inputDirectory}, writes each point as a sparse 2-D
 * vector into a SequenceFile, and records a "{x:..,y:..,cluster:..}" label
 * for each vector in a metadata SequenceFile.
 *
 * Fix over the original: the per-file {@link Scanner} and the two
 * {@code SequenceFile.Writer}s were leaked on any exception path (hidden
 * behind {@code @SuppressWarnings("resource")}); all are now closed in
 * {@code finally} blocks.
 *
 * @param args Configuration arguments
 * @return Exit status (0 on success, -1 when argument parsing fails)
 * @throws Exception if a hidden file is found in the input, or on I/O failure
 * @see ToolRunner
 */
@Override
public int run(String[] args) throws Exception {
    addArguments();
    if (parseArguments(args) == null) {
        return -1;
    }
    initArguments();

    Configuration conf = getConf();
    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);
    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path in = new Path(inputDirectory);
    Path docIdFile = new Path(fileDictDirectory + "/vectorName2docId");
    Path vectorFile = new Path(vectorDirectory + "/part-r-00000");

    SequenceFile.Writer metadataWriter = null;
    SequenceFile.Writer vectorWriter = null;
    try {
        metadataWriter = new SequenceFile.Writer(workingFS, conf, docIdFile, Text.class, Text.class);
        vectorWriter = new SequenceFile.Writer(workingFS, conf, vectorFile, Text.class, VectorWritable.class);

        FileStatus[] files = inputFS.listStatus(in);
        int counter = 0;
        for (FileStatus f : files) {
            Path curr = f.getPath();
            if (curr.getName().startsWith(".")) {
                throw new Exception("Bad Data: Hidden Files Exist");
            }
            Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(inputFS.open(curr))));
            try {
                while (sc.hasNext()) {
                    // Each line is: x,y,cluster
                    String line = sc.nextLine();
                    // NOTE(review): cardinality 10000 is kept from the original even
                    // though only indices 0 and 1 are used; shrinking it would change
                    // the serialized vector's dimensionality for downstream readers.
                    RandomAccessSparseVector vector = new RandomAccessSparseVector(10000);
                    String[] split = line.split(",");
                    double val1 = Double.valueOf(split[0]);
                    double val2 = Double.valueOf(split[1]);
                    int val3 = Integer.valueOf(split[2]);
                    vector.setQuick(0, val1);
                    vector.setQuick(1, val2);
                    // Vectors are named by a running counter; the cluster label only
                    // goes into the metadata record, not the vector itself.
                    String nextName = counter + "";
                    String nextFileName = "/" + counter;
                    counter++;
                    VectorWritable vec = new VectorWritable();
                    vec.set(vector);
                    vectorWriter.append(new Text(nextFileName), vec);
                    String point = "{x:" + val1 + ",y:" + val2 + ",cluster:" + val3 + "}";
                    metadataWriter.append(new Text(point), new Text(nextName));
                }
            } finally {
                sc.close();
            }
        }
    } finally {
        // Close both writers even if one close fails.
        try {
            if (metadataWriter != null) {
                metadataWriter.close();
            }
        } finally {
            if (vectorWriter != null) {
                vectorWriter.close();
            }
        }
    }
    return 0;
}