List of usage examples for org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable
public SequenceFileDirIterable(Path path, PathType pathType, PathFilter filter, Comparator<FileStatus> ordering, boolean reuseKeyValueInstances, Configuration conf)
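Before the per-project examples, here is a minimal, self-contained sketch of this constructor in use (not taken from the projects below; the directory path and key/value types are placeholders): it iterates over every key/value pair across all SequenceFile part files under a directory.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;

public class SequenceFileDirIterableExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Hypothetical directory of part-r-* SequenceFiles holding <Text, IntWritable> entries.
        Path inputDir = new Path("/tmp/wordcount-output");

        // PathType.LIST lists the directory and applies the PathFilter; the null Comparator
        // leaves the file order unspecified; reuseKeyValueInstances = true recycles the
        // Writable instances between iterations, so copy any value you need to keep.
        for (Pair<Text, IntWritable> record : new SequenceFileDirIterable<Text, IntWritable>(
                inputDir, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
            System.out.println(record.getFirst() + " => " + record.getSecond().get());
        }
    }
}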
From source file:ac.keio.sslab.nlp.lda.RowIdJob.java
License:Apache License
@SuppressWarnings("deprecation")
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Path outputPath = getOutputPath();
    Path indexPath = new Path(outputPath, "docIndex");
    Path matrixPath = new Path(outputPath, "matrix");

    try (SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs, conf, indexPath,
            IntWritable.class, Text.class);
            SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs, conf, matrixPath,
                    IntWritable.class, VectorWritable.class)) {
        IntWritable docId = new IntWritable();
        int i = 0;
        int numCols = 0;
        for (Pair<Text, VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>(
                getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), null, true, conf)) {
            VectorWritable value = record.getSecond();
            docId.set(i);
            indexWriter.append(docId, record.getFirst());
            matrixWriter.append(docId, value);
            i++;
            numCols = value.get().size();
        }
        log.info("Wrote out matrix with {} rows and {} columns to {}", i, numCols, matrixPath);
        return 0;
    }
}
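A note on the reuseKeyValueInstances flag (the fifth constructor argument, true here and in every example on this page): with reuse enabled the iterable recycles its key and value Writable instances between iterations, avoiding per-record allocation. RowIdJob is safe because each record is appended to the writers before the iterator advances; if your loop retains references past an iteration, clone the Writables or pass false.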
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java
License:Apache License
/**
 * Read the feature frequency list which is built at the end of the parallel counting job.
 *
 * @return feature frequency list
 */
public static List<Pair<String, Long>> readFList(Parameters params) {
    int minSupport = Integer.valueOf(params.get(MIN_SUPPORT, "3"));
    Configuration conf = new Configuration();
    Path parallelCountingPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);

    // Order features by descending frequency, breaking ties lexicographically.
    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
        new Comparator<Pair<String, Long>>() {
            @Override
            public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                int ret = o2.getSecond().compareTo(o1.getSecond());
                if (ret != 0) {
                    return ret;
                }
                return o1.getFirst().compareTo(o2.getFirst());
            }
        });

    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        if (value >= minSupport) {
            queue.add(new Pair<String, Long>(record.getFirst().toString(), value));
        }
    }

    List<Pair<String, Long>> fList = Lists.newArrayList();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}
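Note the two file-selection styles that recur in these examples: with PathType.GLOB the path itself is a pattern (here FILE_PATTERN under the parallel counting output) and the PathFilter argument is passed as null, while with PathType.LIST the path is a directory and a filter such as PathFilters.partFilter() or PathFilters.logsCRCFilter() skips the non-data files Hadoop leaves alongside the output.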
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
/**
 * @param topicModelStateTemp
 * @param iteration
 * @return the aggregate perplexity divided by the total model weight of the documents
 *         sampled during perplexity computation, or {@code Double.NaN} if no perplexity
 *         data exists for the given iteration
 * @throws IOException
 */
public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration)
        throws IOException {
    Path perplexityPath = perplexityPath(topicModelStateTemp, iteration);
    FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf);
    if (!fs.exists(perplexityPath)) {
        log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath);
        return Double.NaN;
    }
    double perplexity = 0;
    double modelWeight = 0;
    long n = 0;
    for (Pair<DoubleWritable, DoubleWritable> pair : new SequenceFileDirIterable<DoubleWritable, DoubleWritable>(
            perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
        modelWeight += pair.getFirst().get();
        perplexity += pair.getSecond().get();
        n++;
    }
    log.info("Read {} entries with total perplexity {} and model weight {}",
            new Object[] { n, perplexity, modelWeight });
    return perplexity / modelWeight;
}
From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java
License:Apache License
/**
 * Read the feature frequency list built at the end of the word count job and assign ids to the
 * features. This uses constant memory and runs at the speed of disk reads.
 */
private static List<Path> createDictionaryChunks(Path wordCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();

    Configuration conf = new Configuration(baseConf);
    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
            IntWritable.class);
    try {
        long currentChunkSize = 0;
        Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
        int i = 0;
        for (Pair<Writable, Writable> record : new SequenceFileDirIterable<Writable, Writable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.closeQuietly(dictWriter);
                chunkIndex++;
                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);
                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }
            Writable key = record.getFirst();
            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
        maxTermDimension[0] = i;
    } finally {
        Closeables.closeQuietly(dictWriter);
    }
    return chunkPaths;
}
From source file:com.pocketx.gravity.common.TasteHadoopUtils.java
License:Apache License
/**
 * Reads a binary mapping file.
 */
public static OpenIntLongHashMap readItemIDIndexMap(String itemIDIndexPathStr, Configuration conf) {
    OpenIntLongHashMap indexItemIDMap = new OpenIntLongHashMap();
    Path itemIDIndexPath = new Path(itemIDIndexPathStr);
    for (Pair<VarIntWritable, VarLongWritable> record : new SequenceFileDirIterable<VarIntWritable, VarLongWritable>(
            itemIDIndexPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
        indexItemIDMap.put(record.getFirst().get(), record.getSecond().get());
    }
    return indexItemIDMap;
}
From source file:it.polito.dbdmg.searum.ARM.java
License:Apache License
/**
 * Read the header table built at the end of the parallel counting job.
 *
 * @return header table
 */
public static List<Pair<String, Long>> readFList(Parameters params) {
    Configuration conf = new Configuration();
    Path parallelCountingPath = new Path(params.get(OUTPUT), ITEM_FREQ);

    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
        new Comparator<Pair<String, Long>>() {
            public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                int ret = o2.getSecond().compareTo(o1.getSecond());
                if (ret != 0) {
                    return ret;
                }
                return o1.getFirst().compareTo(o2.getFirst());
            }
        });

    // Derive the absolute support count from the relative threshold. This assumes the
    // counting job wrote a "dataset" record holding the total transaction count; if that
    // record is missing, numTrans stays null and the computation below fails.
    Long numTrans = null;
    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        String feature = record.getFirst().toString();
        if (feature.compareTo("dataset") == 0) {
            numTrans = value;
            break;
        }
    }

    Double relativeSupport = Double.valueOf(params.get(MIN_SUPPORT, "0.9"));
    absSupport = (int) Math.ceil(relativeSupport * numTrans);
    log.info("# Transactions: " + numTrans);
    log.info("Support: " + relativeSupport * 100 + "%");
    log.info("Support count: " + absSupport);
    params.set(MIN_SUPPORT, Long.toString(absSupport));

    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        if (value >= absSupport) {
            queue.add(new Pair<String, Long>(record.getFirst().toString(), value));
        }
    }

    List<Pair<String, Long>> fList = Lists.newArrayList();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}
From source file:mlbench.bayes.BayesUtils.java
License:Apache License
static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);
    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);
    try {
        long currentChunkSize = 0;
        long featureCount = 0;
        long vectorCount = Long.MAX_VALUE;
        Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
        for (Pair<IntWritable, LongWritable> record : new SequenceFileDirIterable<IntWritable, LongWritable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(freqWriter, false);
                chunkIndex++;
                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);
                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }
            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            IntWritable key = record.getFirst();
            LongWritable value = record.getSecond();
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);
        }
        featureCount++;
        Long[] counts = { featureCount, vectorCount };
        return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
        Closeables.close(freqWriter, false);
    }
}
From source file:mlbench.bayes.BayesUtils.java
License:Apache License
static List<Path> createDictionaryChunks(Path wordCountPath, Path dictionaryPathBase, Configuration baseConf,
        int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);
    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
            IntWritable.class);
    try {
        long currentChunkSize = 0;
        Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
        int i = 0;
        for (Pair<Writable, Writable> record : new SequenceFileDirIterable<Writable, Writable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(dictWriter, false);
                chunkIndex++;
                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);
                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }
            Writable key = record.getFirst();
            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
        maxTermDimension[0] = i;
    } finally {
        Closeables.close(dictWriter, false);
    }
    return chunkPaths;
}
From source file:org.gpfvic.mahout.cf.taste.hadoop.TasteHadoopUtils.java
License:Apache License
/**
 * Reads a binary mapping file.
 */
public static OpenIntLongHashMap readIDIndexMap(String idIndexPathStr, Configuration conf) {
    OpenIntLongHashMap indexIDMap = new OpenIntLongHashMap();
    Path itemIDIndexPath = new Path(idIndexPathStr);
    for (Pair<VarIntWritable, VarLongWritable> record : new SequenceFileDirIterable<VarIntWritable, VarLongWritable>(
            itemIDIndexPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
        indexIDMap.put(record.getFirst().get(), record.getSecond().get());
    }
    return indexIDMap;
}