List of usage examples for org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder#setTraceDictionary
public void setTraceDictionary(Map<String, Set<Integer>> traceDictionary)
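Every encoder that shares the same trace dictionary records, for each feature it encodes, the vector positions that feature hashed to, which makes the otherwise opaque hashed encoding inspectable. Before the examples from real projects below, here is a minimal self-contained sketch; the field name "body", the two probes, and the vector size 100 are illustrative assumptions, not taken from any example on this page:

import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;

public class TraceDictionaryExample {
    public static void main(String[] args) {
        // shared trace dictionary: feature name -> vector positions it hashed to
        Map<String, Set<Integer>> traceDictionary = new TreeMap<String, Set<Integer>>();

        // "body" is an illustrative field name; 100 is an illustrative vector size
        FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
        encoder.setProbes(2);
        encoder.setTraceDictionary(traceDictionary);

        Vector v = new RandomAccessSparseVector(100);
        encoder.addToVector("hello", 1.0, v);
        encoder.addToVector("world", 1.0, v);

        // with 2 probes, each word should map to up to 2 positions in v
        for (Map.Entry<String, Set<Integer>> e : traceDictionary.entrySet()) {
            System.out.println(e.getKey() + " -> " + e.getValue());
        }
    }
}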
From source file:chapter4.src.logistic.CsvRecordFactoryPredict.java
License:Apache License
/**
 * Processes the first line of a file (which should contain the variable names). The target and
 * predictor column numbers are set from the names on this line.
 *
 * @param line Header line for the file.
 */
public void firstLine(String line) {

    // read variable names, build map of name -> column
    final Map<String, Integer> vars = Maps.newHashMap();
    variableNames = parseCsvLine(line);
    int column = 0;
    for (String var : variableNames) {
        vars.put(var, column++);
    }

    // record target column and establish dictionary for decoding target
    target = vars.get(targetName);

    // record id column
    if (idName != null) {
        id = vars.get(idName);
    }

    // create list of predictor column numbers
    predictors = Lists.newArrayList(Collections2.transform(typeMap.keySet(), new Function<String, Integer>() {
        public Integer apply(String from) {
            Integer r = vars.get(from);
            Preconditions.checkArgument(r != null, "Can't find variable %s, only know about %s", from, vars);
            return r;
        }
    }));

    if (includeBiasTerm) {
        predictors.add(-1);
    }
    Collections.sort(predictors);

    // and map from column number to type encoder for each column that is a predictor
    predictorEncoders = Maps.newHashMap();
    for (Integer predictor : predictors) {
        String name;
        Class<? extends FeatureVectorEncoder> c;
        if (predictor == -1) {
            name = INTERCEPT_TERM;
            c = ConstantValueEncoder.class;
        } else {
            name = variableNames.get(predictor);
            c = TYPE_DICTIONARY.get(typeMap.get(name));
        }
        try {
            Preconditions.checkArgument(c != null, "Invalid type of variable %s, wanted one of %s",
                    typeMap.get(name), TYPE_DICTIONARY.keySet());
            Constructor<? extends FeatureVectorEncoder> constructor = c.getConstructor(String.class);
            Preconditions.checkArgument(constructor != null, "Can't find correct constructor for %s",
                    typeMap.get(name));
            FeatureVectorEncoder encoder = constructor.newInstance(name);
            predictorEncoders.put(predictor, encoder);
            encoder.setTraceDictionary(traceDictionary);
        } catch (InstantiationException e) {
            throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
        } catch (IllegalAccessException e) {
            throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
        } catch (InvocationTargetException e) {
            throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
        } catch (NoSuchMethodException e) {
            throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
        }
    }
}
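Once a record factory like the one above has wired every encoder to a shared traceDictionary, the dictionary can later attribute a trained model's weights back to named features. A hedged sketch of that downstream use with Mahout's ModelDissector; the method name showTopFeatures and all variable names are illustrative, and it assumes v was encoded while the trace dictionary was being populated and that model is a trained SGD classifier:

import java.util.Map;
import java.util.Set;

import org.apache.mahout.classifier.AbstractVectorClassifier;
import org.apache.mahout.classifier.sgd.ModelDissector;
import org.apache.mahout.math.Vector;

public class DissectExample {
    // assumes traceDictionary was populated while v was encoded,
    // and model is a trained SGD classifier
    static void showTopFeatures(Vector v, Map<String, Set<Integer>> traceDictionary,
                                AbstractVectorClassifier model) {
        ModelDissector md = new ModelDissector();
        md.update(v, traceDictionary, model);

        // print the most influential features by weight
        for (ModelDissector.Weight w : md.summary(10)) {
            System.out.printf("%s\t%.3f%n", w.getFeature(), w.getWeight());
        }
    }
}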
From source file:chapter4.src.logistic.CsvRecordFactoryPredict.java
License:Apache License
public void firstLine(String line, String targetName) {

    // read variable names, build map of name -> column
    final Map<String, Integer> vars = Maps.newHashMap();
    variableNames = parseCsvLine(line);
    int column = 0;
    for (String var : variableNames) {
        vars.put(var, column++);
    }

    // the target column is not present in a prediction file,
    // so point it past the last column
    target = vars.size() + 1;

    // record id column
    if (idName != null) {
        id = vars.get(idName);
    }

    // create list of predictor column numbers
    predictors = Lists.newArrayList(Collections2.transform(typeMap.keySet(), new Function<String, Integer>() {
        public Integer apply(String from) {
            Integer r = vars.get(from);
            Preconditions.checkArgument(r != null, "Can't find variable %s, only know about %s", from, vars);
            return r;
        }
    }));

    if (includeBiasTerm) {
        predictors.add(-1);
    }
    Collections.sort(predictors);

    // and map from column number to type encoder for each column that is a predictor
    predictorEncoders = Maps.newHashMap();
    for (Integer predictor : predictors) {
        String name;
        Class<? extends FeatureVectorEncoder> c;
        if (predictor == -1) {
            name = INTERCEPT_TERM;
            c = ConstantValueEncoder.class;
        } else {
            name = variableNames.get(predictor);
            c = TYPE_DICTIONARY.get(typeMap.get(name));
        }
        try {
            Preconditions.checkArgument(c != null, "Invalid type of variable %s, wanted one of %s",
                    typeMap.get(name), TYPE_DICTIONARY.keySet());
            Constructor<? extends FeatureVectorEncoder> constructor = c.getConstructor(String.class);
            Preconditions.checkArgument(constructor != null, "Can't find correct constructor for %s",
                    typeMap.get(name));
            FeatureVectorEncoder encoder = constructor.newInstance(name);
            predictorEncoders.put(predictor, encoder);
            encoder.setTraceDictionary(traceDictionary);
        } catch (InstantiationException e) {
            throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
        } catch (IllegalAccessException e) {
            throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
        } catch (InvocationTargetException e) {
            throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
        } catch (NoSuchMethodException e) {
            throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
        }
    }
}
From source file:com.cloudera.knittingboar.records.CSVBasedDatasetRecordFactory.java
License:Apache License
/**
 * Processes the first line of a file (which should contain the variable
 * names). The target and predictor column numbers are set from the names on
 * this line.
 *
 * @param line Header line for the file.
 */
public void firstLine(String line) {

    // read variable names, build map of name -> column
    final Map<String, Integer> vars = Maps.newHashMap();
    variableNames = Lists.newArrayList(COMMA.split(line));
    int column = 0;
    for (String var : variableNames) {
        vars.put(var, column++);
    }

    // record target column and establish dictionary for decoding target
    target = vars.get(targetName);

    // record id column
    if (idName != null) {
        id = vars.get(idName);
    }

    // create list of predictor column numbers
    predictors = Lists.newArrayList(Collections2.transform(typeMap.keySet(), new Function<String, Integer>() {
        @Override
        public Integer apply(String from) {
            Integer r = vars.get(from);
            Preconditions.checkArgument(r != null, "Can't find variable %s, only know about %s", from, vars);
            return r;
        }
    }));

    if (includeBiasTerm) {
        predictors.add(-1);
    }
    Collections.sort(predictors);

    // and map from column number to type encoder for each column that is a predictor
    predictorEncoders = Maps.newHashMap();
    for (Integer predictor : predictors) {
        String name;
        Class<? extends FeatureVectorEncoder> c;
        if (predictor == -1) {
            name = INTERCEPT_TERM;
            c = ConstantValueEncoder.class;
        } else {
            name = variableNames.get(predictor);
            c = TYPE_DICTIONARY.get(typeMap.get(name));
        }
        try {
            Preconditions.checkArgument(c != null, "Invalid type of variable %s, wanted one of %s",
                    typeMap.get(name), TYPE_DICTIONARY.keySet());
            Constructor<? extends FeatureVectorEncoder> constructor = c.getConstructor(String.class);
            Preconditions.checkArgument(constructor != null, "Can't find correct constructor for %s",
                    typeMap.get(name));
            FeatureVectorEncoder encoder = constructor.newInstance(name);
            predictorEncoders.put(predictor, encoder);
            encoder.setTraceDictionary(traceDictionary);
        } catch (InstantiationException e) {
            throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
        } catch (IllegalAccessException e) {
            throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
        } catch (InvocationTargetException e) {
            throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
        } catch (NoSuchMethodException e) {
            throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
        }
    }
}
From source file:com.cloudera.knittingboar.records.Test20NewsgroupsBookParsing.java
License:Apache License
public void test20NewsgroupsFileScan() throws IOException {

    // p.270 ----- metrics to track lucene's parsing mechanics, progress, performance of OLR ------------
    double averageLL = 0.0;
    double averageCorrect = 0.0;
    double averageLineCount = 0.0;
    int k = 0;
    double step = 0.0;
    int[] bumps = new int[] { 1, 2, 5 };
    double lineCount = 0;

    Splitter onColon = Splitter.on(":").trimResults();

    // last line on p.269
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);

    File base = new File("/Users/jpatterson/Downloads/datasets/20news-bydate/20-debug/");
    overallCounts = HashMultiset.create();

    // p.269 ---------------------------------------------------------
    Map<String, Set<Integer>> traceDictionary = new TreeMap<String, Set<Integer>>();

    // encodes the text content in both the subject and the body of the email
    FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
    encoder.setProbes(2);
    encoder.setTraceDictionary(traceDictionary);

    // provides a constant offset that the model can use to encode the average frequency
    // of each class
    FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
    bias.setTraceDictionary(traceDictionary);

    // used to encode the number of lines in a message
    FeatureVectorEncoder lines = new ConstantValueEncoder("Lines");
    lines.setTraceDictionary(traceDictionary);

    Dictionary newsGroups = new Dictionary();

    // bottom of p.269 ------------------------------
    // because OLR expects to get integer class IDs for the target variable during training
    // we need a dictionary to convert the target variable (the newsgroup name)
    // to an integer, which is the newsGroups object
    List<File> files = new ArrayList<File>();
    for (File newsgroup : base.listFiles()) {
        newsGroups.intern(newsgroup.getName());
        System.out.println(">> " + newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
    }

    // mix up the files, helps training in OLR
    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());

    // ----- p.270 ------------ "reading and tokenizing the data" ---------
    for (File file : files) {
        BufferedReader reader = new BufferedReader(new FileReader(file));

        // identify newsgroup ----------------
        // convert newsgroup name to unique id
        // -----------------------------------
        String ng = file.getParentFile().getName();
        int actual = newsGroups.intern(ng);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // check for line count header -------
        String line = reader.readLine();
        while (line != null && line.length() > 0) {

            // if this is a line that has a line count, let's pull that value out ------
            if (line.startsWith("Lines:")) {
                String count = Iterables.get(onColon.split(line), 1);
                try {
                    lineCount = Integer.parseInt(count);
                    averageLineCount += (lineCount - averageLineCount) / Math.min(k + 1, 1000);
                } catch (NumberFormatException e) {
                    // if anything goes wrong in the parse: just use the average count
                    lineCount = averageLineCount;
                }
            }

            // which header words to actually count
            boolean countHeader = (line.startsWith("From:") || line.startsWith("Subject:")
                    || line.startsWith("Keywords:") || line.startsWith("Summary:"));

            // we're still in the header at this point; consume this header line
            // plus any continuation lines, which start with " "
            do {
                // get a reader for this specific string ------
                StringReader in = new StringReader(line);

                // ---- count words in header ---------
                if (countHeader) {
                    countWords(analyzer, words, in);
                }

                // iterate to the next string ----
                line = reader.readLine();
            } while (line != null && line.startsWith(" ")); // null check avoids an NPE on header-only files
        }

        // now we're done with the header

        // -------- count words in body ----------
        countWords(analyzer, words, reader);
        reader.close();
    }
}
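Because all three encoders in the test above share one traceDictionary, a scan like this leaves behind a complete map from feature names to the vector slots they occupy. Inverting that map is a cheap way to spot hash collisions; a small sketch, where the class and helper names are illustrative:

import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

public class TraceCollisions {
    // a position claimed by more than one feature name is a hash collision
    static void printCollisions(Map<String, Set<Integer>> traceDictionary) {
        Map<Integer, Set<String>> byPosition = new TreeMap<Integer, Set<String>>();
        for (Map.Entry<String, Set<Integer>> e : traceDictionary.entrySet()) {
            for (Integer pos : e.getValue()) {
                if (!byPosition.containsKey(pos)) {
                    byPosition.put(pos, new TreeSet<String>());
                }
                byPosition.get(pos).add(e.getKey());
            }
        }
        for (Map.Entry<Integer, Set<String>> e : byPosition.entrySet()) {
            if (e.getValue().size() > 1) {
                System.out.println("collision at " + e.getKey() + ": " + e.getValue());
            }
        }
    }
}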
From source file:com.cloudera.knittingboar.records.TwentyNewsgroupsRecordFactory.java
License:Apache License
/**
 * Processes a single line of input into:
 *   - the target variable
 *   - a feature vector
 *
 * @throws Exception if the line is not well formed
 */
public int processLine(String line, Vector v) throws Exception {

    String[] parts = line.split(this.class_id_split_string);
    if (parts.length < 2) {
        throw new Exception("line is not well formed.");
    }

    String newsgroup_name = parts[0];
    String msg = parts[1];

    // p.269 ---------------------------------------------------------
    Map<String, Set<Integer>> traceDictionary = new TreeMap<String, Set<Integer>>();

    // encodes the text content in both the subject and the body of the email
    FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
    encoder.setProbes(2);
    encoder.setTraceDictionary(traceDictionary);

    // provides a constant offset that the model can use to encode the average
    // frequency of each class
    FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
    bias.setTraceDictionary(traceDictionary);

    int actual = newsGroups.intern(newsgroup_name);

    Multiset<String> words = ConcurrentHashMultiset.create();

    StringReader in = new StringReader(msg);
    countWords(analyzer, words, in);

    // ----- p.271 -----------
    // original value does nothing in a ConstantValueEncoder
    bias.addToVector("", 1, v);

    // the "Lines" and "LogLines" encoders used elsewhere are not applied here:
    // lines.addToVector("", lineCount / 30, v);
    // logLines.addToVector("", Math.log(lineCount + 1), v);

    // now scan through all the words and add them
    for (String word : words.elementSet()) {
        encoder.addToVector(word, Math.log(1 + words.count(word)), v);
    }

    return actual;
}
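Note a design wrinkle in processLine above: the trace dictionary and both encoders are rebuilt on every call, so traces never accumulate across records and each line pays the construction cost. The record factories earlier on this page instead build their encoders once in firstLine and reuse them for every record.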
From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLR_Train20Newsgroups.java
License:Apache License
public void testTrainNewsGroups() throws IOException {

    File base = new File("/Users/jpatterson/Downloads/datasets/20news-bydate/20news-bydate-train/");
    overallCounts = HashMultiset.create();

    long startTime = System.currentTimeMillis();

    // p.269 ---------------------------------------------------------
    Map<String, Set<Integer>> traceDictionary = new TreeMap<String, Set<Integer>>();

    // encodes the text content in both the subject and the body of the email
    FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
    encoder.setProbes(2);
    encoder.setTraceDictionary(traceDictionary);

    // provides a constant offset that the model can use to encode the average frequency
    // of each class
    FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
    bias.setTraceDictionary(traceDictionary);

    // used to encode the number of lines in a message
    FeatureVectorEncoder lines = new ConstantValueEncoder("Lines");
    lines.setTraceDictionary(traceDictionary);

    FeatureVectorEncoder logLines = new ConstantValueEncoder("LogLines");
    logLines.setTraceDictionary(traceDictionary);

    Dictionary newsGroups = new Dictionary();

    // matches the OLR setup on p.269 ---------------
    // stepOffset, decay, and alpha --- describe how the learning rate decreases
    // lambda: amount of regularization
    // learningRate: initial learning rate
    OnlineLogisticRegression learningAlgorithm = new OnlineLogisticRegression(20, FEATURES, new L1())
            .alpha(1).stepOffset(1000).decayExponent(0.9).lambda(3.0e-5).learningRate(20);

    // bottom of p.269 ------------------------------
    // because OLR expects to get integer class IDs for the target variable during training
    // we need a dictionary to convert the target variable (the newsgroup name)
    // to an integer, which is the newsGroups object
    List<File> files = new ArrayList<File>();
    for (File newsgroup : base.listFiles()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
    }

    // mix up the files, helps training in OLR
    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());

    // p.270 ----- metrics to track lucene's parsing mechanics, progress, performance of OLR ------------
    double averageLL = 0.0;
    double averageCorrect = 0.0;
    double averageLineCount = 0.0;
    int k = 0;
    double step = 0.0;
    int[] bumps = new int[] { 1, 2, 5 };
    double lineCount = 0;

    // last line on p.269
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);

    Splitter onColon = Splitter.on(":").trimResults();

    int input_file_count = 0;

    // ----- p.270 ------------ "reading and tokenizing the data" ---------
    for (File file : files) {
        BufferedReader reader = new BufferedReader(new FileReader(file));
        input_file_count++;

        // identify newsgroup ----------------
        // convert newsgroup name to unique id
        // -----------------------------------
        String ng = file.getParentFile().getName();
        int actual = newsGroups.intern(ng);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // check for line count header -------
        String line = reader.readLine();
        while (line != null && line.length() > 0) {

            // if this is a line that has a line count, let's pull that value out ------
            if (line.startsWith("Lines:")) {
                String count = Iterables.get(onColon.split(line), 1);
                try {
                    lineCount = Integer.parseInt(count);
                    averageLineCount += (lineCount - averageLineCount) / Math.min(k + 1, 1000);
                } catch (NumberFormatException e) {
                    // if anything goes wrong in the parse: just use the average count
                    lineCount = averageLineCount;
                }
            }

            // which header words to actually count
            boolean countHeader = (line.startsWith("From:") || line.startsWith("Subject:")
                    || line.startsWith("Keywords:") || line.startsWith("Summary:"));

            // consume this header line plus any continuation lines, which start with " "
            do {
                // get a reader for this specific string ------
                StringReader in = new StringReader(line);

                // ---- count words in header ---------
                if (countHeader) {
                    countWords(analyzer, words, in);
                }

                // iterate to the next string ----
                line = reader.readLine();
            } while (line != null && line.startsWith(" ")); // null check avoids an NPE on header-only files
        }

        // -------- count words in body ----------
        countWords(analyzer, words, reader);
        reader.close();

        // ----- p.271 -----------
        Vector v = new RandomAccessSparseVector(FEATURES);

        // original value does nothing in a ConstantValueEncoder
        bias.addToVector("", 1, v);

        // original value does nothing in a ConstantValueEncoder
        lines.addToVector("", lineCount / 30, v);

        // original value does nothing in a ConstantValueEncoder
        logLines.addToVector("", Math.log(lineCount + 1), v);

        // now scan through all the words and add them
        for (String word : words.elementSet()) {
            encoder.addToVector(word, Math.log(1 + words.count(word)), v);
        }

        // calc stats ---------
        double mu = Math.min(k + 1, 200);
        double ll = learningAlgorithm.logLikelihood(actual, v);
        averageLL = averageLL + (ll - averageLL) / mu;

        Vector p = new DenseVector(20);
        learningAlgorithm.classifyFull(p, v);
        int estimated = p.maxValueIndex();

        int correct = (estimated == actual ? 1 : 0);
        averageCorrect = averageCorrect + (correct - averageCorrect) / mu;

        learningAlgorithm.train(actual, v);

        k++;
        int bump = bumps[(int) Math.floor(step) % bumps.length];
        int scale = (int) Math.pow(10, Math.floor(step / bumps.length));
        if (k % (bump * scale) == 0) {
            step += 0.25;
            System.out.printf("%10d %10.3f %10.3f %10.2f %s %s\n", k, ll, averageLL,
                    averageCorrect * 100, ng, newsGroups.values().get(estimated));
        }
    }

    // close() signals that training is finished; moved out of the loop so the
    // learner is not finalized after every file
    learningAlgorithm.close();

    Utils.PrintVectorSection(learningAlgorithm.getBeta().viewRow(0), 3);

    long endTime = System.currentTimeMillis();
    long duration = (endTime - startTime);
    System.out.println("Processed Input Files: " + input_file_count + ", time: " + duration + "ms");

    ModelSerializer.writeBinary("/tmp/olr-news-group.model", learningAlgorithm);
}