Example usage for the org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable constructor

Introduction

This page lists example usages of the org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable(Path, boolean, Configuration) constructor.

Prototype

public SequenceFileIterable(Path path, boolean reuseKeyValueInstances, Configuration conf) 
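
Before the full examples below, here is a minimal sketch of the typical pattern, given as an illustration rather than taken from any of the source files: build a SequenceFileIterable over a SequenceFile path and iterate over its entries as Mahout Pair objects. The path /tmp/dictionary.file-0 and the <Text, IntWritable> key/value types are assumptions for this sketch only. Passing true for reuseKeyValueInstances lets the iterator recycle the Writable instances between steps, so copy out plain values (toString(), get()) before storing them, as every example below does.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;

public class SequenceFileIterableSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Hypothetical path to a SequenceFile of <Text, IntWritable> entries.
        Path path = new Path("/tmp/dictionary.file-0");
        // reuseKeyValueInstances = true: the same Writable objects may be handed back
        // on each iteration, so extract primitive/String values before keeping them.
        for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(path, true, conf)) {
            System.out.println(pair.getFirst().toString() + " -> " + pair.getSecond().get());
        }
    }
}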

Usage

From source file:ClassifierHD.java

License:Apache License

public static Map<String, Integer> readDictionnary(Configuration conf, Path dictionnaryPath) {
    Map<String, Integer> dictionnary = new HashMap<String, Integer>();
    for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(dictionnaryPath, true,
            conf)) {
        dictionnary.put(pair.getFirst().toString(), pair.getSecond().get());
    }
    return dictionnary;
}

From source file:ClassifierHD.java

License:Apache License

public static Map<Integer, Long> readDocumentFrequency(Configuration conf, Path documentFrequencyPath) {
    Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>();
    for (Pair<IntWritable, LongWritable> pair : new SequenceFileIterable<IntWritable, LongWritable>(
            documentFrequencyPath, true, conf)) {
        documentFrequency.put(pair.getFirst().get(), pair.getSecond().get());
    }
    return documentFrequency;
}

From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth.FPGrowth.java

License:Apache License

public static List<Pair<String, TopKStringPatterns>> readFrequentPattern(Configuration conf, Path path) {
    List<Pair<String, TopKStringPatterns>> ret = Lists.newArrayList();
    // key is the feature, value is its count
    for (Pair<Writable, TopKStringPatterns> record : new SequenceFileIterable<Writable, TopKStringPatterns>(
            path, true, conf)) {
        ret.add(new Pair<String, TopKStringPatterns>(record.getFirst().toString(),
                new TopKStringPatterns(record.getSecond().getPatterns())));
    }
    return ret;
}

From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java

License:Apache License

/**
 * Generates the fList from the serialized string representation
 *
 * @return Deserialized Feature Frequency List
 */
public static List<Pair<String, Long>> readFList(Configuration conf) throws IOException {
    List<Pair<String, Long>> list = Lists.newArrayList();

    Path[] files = HadoopUtil.getCachedFiles(conf);
    if (files.length != 1) {
        throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ')');
    }

    for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(files[0], true, conf)) {
        list.add(new Pair<String, Long>(record.getFirst().toString(), record.getSecond().get()));
    }
    return list;
}

From source file:com.cg.mapreduce.myfpgrowth.ParallelFPGrowthMapper.java

License:Apache License

/**
 * Generates the fList from the serialized string representation
 *
 * @return Deserialized Feature Frequency List
 */
public List<Pair<String, Long>> readFList(Configuration conf) throws IOException {
    List<Pair<String, Long>> list = Lists.newArrayList();

    Path[] files = HadoopUtil.getCachedFiles(conf);
    if (files.length != 1) {
        throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ')');
    }

    for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(files[0], true, conf)) {
        list.add(new Pair<String, Long>(record.getFirst().toString(), record.getSecond().get()));
    }
    return list;
}

From source file:com.chimpler.example.bayes.TopCategoryWords.java

License:Apache License

public static Map<Integer, String> readInverseDictionnary(Configuration conf, Path dictionnaryPath) {
    Map<Integer, String> inverseDictionnary = new HashMap<Integer, String>();
    for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(dictionnaryPath, true,
            conf)) {
        inverseDictionnary.put(pair.getSecond().get(), pair.getFirst().toString());
    }
    return inverseDictionnary;
}

From source file:com.elex.dmp.core.TopicModel.java

License:Apache License

public static Pair<Matrix, Vector> loadModel(Configuration conf, Path... modelPaths) throws IOException {
    int numTopics = -1;
    int numTerms = -1;
    List<Pair<Integer, Vector>> rows = Lists.newArrayList();
    for (Path modelPath : modelPaths) {
        for (Pair<Text, VectorWritable> row : new SequenceFileIterable<Text, VectorWritable>(modelPath, true,
                conf)) {
            rows.add(Pair.of(Integer.parseInt(row.getFirst().toString()), row.getSecond().get())); // key is the topic id stored as Text
            numTopics = Math.max(numTopics, Integer.parseInt(row.getFirst().toString()));
            if (numTerms < 0) {
                numTerms = row.getSecond().get().size();
            }
        }
    }
    if (rows.isEmpty()) {
        throw new IOException(Arrays.toString(modelPaths) + " have no vectors in it");
    }
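    // topic ids are zero-based, so the topic count is the largest id seen plus one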
    numTopics++;
    Matrix model = new DenseMatrix(numTopics, numTerms);
    Vector topicSums = new DenseVector(numTopics);
    for (Pair<Integer, Vector> pair : rows) {
        model.viewRow(pair.getFirst()).assign(pair.getSecond());
        topicSums.set(pair.getFirst(), pair.getSecond().norm(1));
    }
    return Pair.of(model, topicSums);
}

From source file:com.elex.dmp.lda.InMemoryCollapsedVariationalBayes0.java

License:Apache License

private static String[] loadDictionary(String dictionaryPath, Configuration conf) {
    if (dictionaryPath == null) {
        return null;
    }
    Path dictionaryFile = new Path(dictionaryPath);
    List<Pair<Integer, String>> termList = Lists.newArrayList();
    int maxTermId = 0;
    // key is the word, value is its id
    for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile,
            true, conf)) {
        termList.add(new Pair<Integer, String>(record.getSecond().get(), record.getFirst().toString()));
        maxTermId = Math.max(maxTermId, record.getSecond().get());
    }
    String[] terms = new String[maxTermId + 1];
    for (Pair<Integer, String> pair : termList) {
        terms[pair.getFirst()] = pair.getSecond();
    }
    return terms;
}

From source file:com.elex.dmp.lda.InMemoryCollapsedVariationalBayes0.java

License:Apache License

private static Matrix loadVectors(String vectorPathString, Configuration conf) throws IOException {
    Path vectorPath = new Path(vectorPathString);
    FileSystem fs = vectorPath.getFileSystem(conf);
    List<Path> subPaths = Lists.newArrayList();
    if (fs.isFile(vectorPath)) {
        subPaths.add(vectorPath);
    } else {
        for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
            subPaths.add(fileStatus.getPath());
        }
    }
    List<Vector> vectorList = Lists.newArrayList();
    for (Path subPath : subPaths) {
        for (Pair<IntWritable, VectorWritable> record : new SequenceFileIterable<IntWritable, VectorWritable>(
                subPath, true, conf)) {
            vectorList.add(record.getSecond().get());
        }
    }
    int numRows = vectorList.size();
    int numCols = vectorList.get(0).size();
    return new SparseRowMatrix(numRows, numCols, vectorList.toArray(new Vector[vectorList.size()]), true,
            vectorList.get(0).isSequentialAccess());
}

From source file:com.elex.dmp.lda.TopicModel.java

License:Apache License

public static Pair<Matrix, Vector> loadModel(Configuration conf, Path... modelPaths) throws IOException {
    int numTopics = -1;
    int numTerms = -1;
    List<Pair<Integer, Vector>> rows = Lists.newArrayList();
    for (Path modelPath : modelPaths) {
        for (Pair<Text, VectorWritable> row : new SequenceFileIterable<Text, VectorWritable>(modelPath, true,
                conf)) {
            rows.add(Pair.of(Integer.parseInt(row.getFirst().toString()), row.getSecond().get())); // key is the topic id stored as Text
            numTopics = Math.max(numTopics, Integer.parseInt(row.getFirst().toString()));
            if (numTerms < 0) {
                numTerms = row.getSecond().get().size();
            }
        }
    }
    if (rows.isEmpty()) {
        throw new IOException(Arrays.toString(modelPaths) + " have no vectors in it");
    }
    numTopics++;
    Matrix model = new DenseMatrix(numTopics, numTerms);
    Vector topicSums = new DenseVector(numTopics);
    for (Pair<Integer, Vector> pair : rows) {
        model.viewRow(pair.getFirst()).assign(pair.getSecond());
        topicSums.set(pair.getFirst(), pair.getSecond().norm(1));
    }
    return Pair.of(model, topicSums);
}