Usage examples for the constructor of org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable
public SequenceFileIterable(Path path, boolean reuseKeyValueInstances, Configuration conf)
From source file:ClassifierHD.java
License:Apache License
/**
 * Reads every (Text, IntWritable) record of a sequence file into an in-memory map.
 *
 * @param conf            Hadoop configuration used to open the sequence file
 * @param dictionnaryPath path of the sequence file to read
 * @return map from each record's key (as a String) to its integer value;
 *         presumably term -> term id, judging by the method name
 */
public static Map<String, Integer> readDictionnary(Configuration conf, Path dictionnaryPath) {
    SequenceFileIterable<Text, IntWritable> entries =
            new SequenceFileIterable<Text, IntWritable>(dictionnaryPath, true, conf);
    Map<String, Integer> termToId = new HashMap<String, Integer>();
    for (Pair<Text, IntWritable> entry : entries) {
        // The iterable is created with reuseKeyValueInstances=true, so copy
        // the key and value out of the Writables before the next iteration.
        String term = entry.getFirst().toString();
        int id = entry.getSecond().get();
        termToId.put(term, id);
    }
    return termToId;
}
From source file:ClassifierHD.java
License:Apache License
/**
 * Reads every (IntWritable, LongWritable) record of a sequence file into an in-memory map.
 *
 * @param conf                  Hadoop configuration used to open the sequence file
 * @param documentFrequencyPath path of the sequence file to read
 * @return map from each record's integer key to its long value;
 *         presumably term id -> document frequency, judging by the method name
 */
public static Map<Integer, Long> readDocumentFrequency(Configuration conf, Path documentFrequencyPath) {
    SequenceFileIterable<IntWritable, LongWritable> entries =
            new SequenceFileIterable<IntWritable, LongWritable>(documentFrequencyPath, true, conf);
    Map<Integer, Long> frequencyById = new HashMap<Integer, Long>();
    for (Pair<IntWritable, LongWritable> entry : entries) {
        // Writable instances are reused between iterations; extract primitives right away.
        int id = entry.getFirst().get();
        long frequency = entry.getSecond().get();
        frequencyById.put(id, frequency);
    }
    return frequencyById;
}
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth.FPGrowth.java
License:Apache License
public static List<Pair<String, TopKStringPatterns>> readFrequentPattern(Configuration conf, Path path) { List<Pair<String, TopKStringPatterns>> ret = Lists.newArrayList(); // key is feature value is count for (Pair<Writable, TopKStringPatterns> record : new SequenceFileIterable<Writable, TopKStringPatterns>( path, true, conf)) {// w w w . ja v a2 s . co m ret.add(new Pair<String, TopKStringPatterns>(record.getFirst().toString(), new TopKStringPatterns(record.getSecond().getPatterns()))); } return ret; }
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java
License:Apache License
/** * Generates the fList from the serialized string representation * /*w w w .j av a 2 s . co m*/ * @return Deserialized Feature Frequency List */ public static List<Pair<String, Long>> readFList(Configuration conf) throws IOException { List<Pair<String, Long>> list = Lists.newArrayList(); Path[] files = HadoopUtil.getCachedFiles(conf); if (files.length != 1) { throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ')'); } for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(files[0], true, conf)) { list.add(new Pair<String, Long>(record.getFirst().toString(), record.getSecond().get())); } return list; }
From source file:com.cg.mapreduce.myfpgrowth.ParallelFPGrowthMapper.java
License:Apache License
/** * Generates the fList from the serialized string representation * /*from w w w .j ava 2s . co m*/ * @return Deserialized Feature Frequency List */ public List<Pair<String, Long>> readFList(Configuration conf) throws IOException { List<Pair<String, Long>> list = Lists.newArrayList(); Path[] files = HadoopUtil.getCachedFiles(conf); if (files.length != 1) { throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ')'); } for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(files[0], true, conf)) { list.add(new Pair<String, Long>(record.getFirst().toString(), record.getSecond().get())); } return list; }
From source file:com.chimpler.example.bayes.TopCategoryWords.java
License:Apache License
/**
 * Reads every (Text, IntWritable) record of a sequence file into a map keyed
 * by the integer value, i.e. the inverse of the on-disk key/value direction.
 *
 * @param conf            Hadoop configuration used to open the sequence file
 * @param dictionnaryPath path of the sequence file to read
 * @return map from each record's integer value to its key (as a String)
 */
public static Map<Integer, String> readInverseDictionnary(Configuration conf, Path dictionnaryPath) {
    SequenceFileIterable<Text, IntWritable> entries =
            new SequenceFileIterable<Text, IntWritable>(dictionnaryPath, true, conf);
    Map<Integer, String> idToTerm = new HashMap<Integer, String>();
    for (Pair<Text, IntWritable> entry : entries) {
        // Writables are reused across iterations; copy the values out.
        int id = entry.getSecond().get();
        String term = entry.getFirst().toString();
        idToTerm.put(id, term);
    }
    return idToTerm;
}
From source file:com.elex.dmp.core.TopicModel.java
License:Apache License
public static Pair<Matrix, Vector> loadModel(Configuration conf, Path... modelPaths) throws IOException { int numTopics = -1; int numTerms = -1; List<Pair<Integer, Vector>> rows = Lists.newArrayList(); for (Path modelPath : modelPaths) { for (Pair<Text, VectorWritable> row : new SequenceFileIterable<Text, VectorWritable>(modelPath, true, conf)) {/*from www. j a v a2 s. c o m*/ rows.add(Pair.of(Integer.parseInt(row.getFirst().toString()), row.getSecond().get()));//keytext numTopics = Math.max(numTopics, Integer.parseInt(row.getFirst().toString()));//keytext if (numTerms < 0) { numTerms = row.getSecond().get().size(); } } } if (rows.isEmpty()) { throw new IOException(Arrays.toString(modelPaths) + " have no vectors in it"); } numTopics++; Matrix model = new DenseMatrix(numTopics, numTerms); Vector topicSums = new DenseVector(numTopics); for (Pair<Integer, Vector> pair : rows) { model.viewRow(pair.getFirst()).assign(pair.getSecond()); topicSums.set(pair.getFirst(), pair.getSecond().norm(1)); } return Pair.of(model, topicSums); }
From source file:com.elex.dmp.lda.InMemoryCollapsedVariationalBayes0.java
License:Apache License
private static String[] loadDictionary(String dictionaryPath, Configuration conf) { if (dictionaryPath == null) { return null; }/*from w w w .j a v a 2 s . c o m*/ Path dictionaryFile = new Path(dictionaryPath); List<Pair<Integer, String>> termList = Lists.newArrayList(); int maxTermId = 0; // key is word value is id for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) { termList.add(new Pair<Integer, String>(record.getSecond().get(), record.getFirst().toString())); maxTermId = Math.max(maxTermId, record.getSecond().get()); } String[] terms = new String[maxTermId + 1]; for (Pair<Integer, String> pair : termList) { terms[pair.getFirst()] = pair.getSecond(); } return terms; }
From source file:com.elex.dmp.lda.InMemoryCollapsedVariationalBayes0.java
License:Apache License
/**
 * Loads all vectors found at a path into a {@code SparseRowMatrix}.
 *
 * <p>If the path is a single file it is read directly; otherwise every entry
 * of the directory accepted by {@code PathFilters.logsCRCFilter()} is read.
 * Record keys are ignored: vectors become matrix rows in the order they are read.
 *
 * @param vectorPathString file or directory holding (IntWritable, VectorWritable) sequence files
 * @param conf             Hadoop configuration used to resolve the file system and open the files
 * @return a matrix with one row per vector read; the column count is the size of the first vector
 * @throws IOException if the file system cannot be accessed, or if no vectors were found
 */
private static Matrix loadVectors(String vectorPathString, Configuration conf) throws IOException {
    Path vectorPath = new Path(vectorPathString);
    FileSystem fs = vectorPath.getFileSystem(conf);

    // Resolve the input to a flat list of files to read.
    List<Path> subPaths = Lists.newArrayList();
    if (fs.isFile(vectorPath)) {
        subPaths.add(vectorPath);
    } else {
        for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
            subPaths.add(fileStatus.getPath());
        }
    }

    List<Vector> vectorList = Lists.newArrayList();
    for (Path subPath : subPaths) {
        for (Pair<IntWritable, VectorWritable> record
                : new SequenceFileIterable<IntWritable, VectorWritable>(subPath, true, conf)) {
            vectorList.add(record.getSecond().get());
        }
    }

    // Fail with a descriptive error instead of letting get(0) below throw a
    // bare IndexOutOfBoundsException when the input contains no vectors.
    if (vectorList.isEmpty()) {
        throw new IOException(vectorPathString + " contains no vectors");
    }

    Vector firstVector = vectorList.get(0);
    int numRows = vectorList.size();
    int numCols = firstVector.size();
    return new SparseRowMatrix(numRows, numCols, vectorList.toArray(new Vector[numRows]), true,
            firstVector.isSequentialAccess());
}
From source file:com.elex.dmp.lda.TopicModel.java
License:Apache License
public static Pair<Matrix, Vector> loadModel(Configuration conf, Path... modelPaths) throws IOException { int numTopics = -1; int numTerms = -1; List<Pair<Integer, Vector>> rows = Lists.newArrayList(); for (Path modelPath : modelPaths) { for (Pair<Text, VectorWritable> row : new SequenceFileIterable<Text, VectorWritable>(modelPath, true, conf)) {/* w w w .j av a 2 s.co m*/ rows.add(Pair.of(Integer.parseInt(row.getFirst().toString()), row.getSecond().get()));//keytext numTopics = Math.max(numTopics, Integer.parseInt(row.getFirst().toString()));//keytext if (numTerms < 0) { numTerms = row.getSecond().get().size(); } } } if (rows.isEmpty()) { throw new IOException(Arrays.toString(modelPaths) + " have no vectors in it"); } numTopics++; Matrix model = new DenseMatrix(numTopics, numTerms); Vector topicSums = new DenseVector(numTopics); for (Pair<Integer, Vector> pair : rows) { model.viewRow(pair.getFirst()).assign(pair.getSecond()); topicSums.set(pair.getFirst(), pair.getSecond().norm(1)); } return Pair.of(model, topicSums); }