List of usage examples for org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable
public SequenceFileDirIterable(Path path, PathType pathType, PathFilter filter, Comparator<FileStatus> ordering, boolean reuseKeyValueInstances, Configuration conf)
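Before the per-project examples, here is a minimal, self-contained sketch of this constructor in use (not taken from the projects below; the directory path and key/value types are placeholders): it iterates over every key/value pair across all SequenceFile part files under a directory.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;

public class SequenceFileDirIterableExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Hypothetical directory of part-r-* SequenceFiles holding <Text, IntWritable> entries.
        Path inputDir = new Path("/tmp/wordcount-output");

        // PathType.LIST lists the directory and applies the PathFilter; the null Comparator
        // leaves the file order unspecified; reuseKeyValueInstances = true recycles the
        // Writable instances between iterations, so copy any value you need to keep.
        for (Pair<Text, IntWritable> record : new SequenceFileDirIterable<Text, IntWritable>(
                inputDir, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
            System.out.println(record.getFirst() + " => " + record.getSecond().get());
        }
    }
}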
From source file:ac.keio.sslab.nlp.lda.RowIdJob.java
License:Apache License
@SuppressWarnings("deprecation")
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Path outputPath = getOutputPath();
    Path indexPath = new Path(outputPath, "docIndex");
    Path matrixPath = new Path(outputPath, "matrix");

    try (SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs, conf, indexPath,
            IntWritable.class, Text.class);
            SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs, conf, matrixPath,
                    IntWritable.class, VectorWritable.class)) {
        IntWritable docId = new IntWritable();
        int i = 0;
        int numCols = 0;
        for (Pair<Text, VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>(
                getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), null, true, conf)) {
            VectorWritable value = record.getSecond();
            docId.set(i);
            indexWriter.append(docId, record.getFirst());
            matrixWriter.append(docId, value);
            i++;
            numCols = value.get().size();
        }
        log.info("Wrote out matrix with {} rows and {} columns to {}", i, numCols, matrixPath);
        return 0;
    }
}
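A note on the reuseKeyValueInstances flag (the fifth constructor argument, true here and in every example on this page): with reuse enabled the iterable recycles its key and value Writable instances between iterations, avoiding per-record allocation. RowIdJob is safe because each record is appended to the writers before the iterator advances; if your loop retains references past an iteration, clone the Writables or pass false.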
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java
License:Apache License
/**
 * Read the feature frequency list which is built at the end of the parallel counting job.
 *
 * @return feature frequency list
 */
public static List<Pair<String, Long>> readFList(Parameters params) {
    int minSupport = Integer.valueOf(params.get(MIN_SUPPORT, "3"));
    Configuration conf = new Configuration();
    Path parallelCountingPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);

    // Order features by descending frequency, breaking ties lexicographically.
    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
        new Comparator<Pair<String, Long>>() {
            @Override
            public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                int ret = o2.getSecond().compareTo(o1.getSecond());
                if (ret != 0) {
                    return ret;
                }
                return o1.getFirst().compareTo(o2.getFirst());
            }
        });

    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        if (value >= minSupport) {
            queue.add(new Pair<String, Long>(record.getFirst().toString(), value));
        }
    }

    List<Pair<String, Long>> fList = Lists.newArrayList();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}
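Note the two file-selection styles that recur in these examples: with PathType.GLOB the path itself is a pattern (here FILE_PATTERN under the parallel counting output) and the PathFilter argument is passed as null, while with PathType.LIST the path is a directory and a filter such as PathFilters.partFilter() or PathFilters.logsCRCFilter() skips the non-data files Hadoop leaves alongside the output.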
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
/**
 * @param topicModelStateTemp
 * @param iteration
 * @return the aggregate perplexity divided by the total model weight of the documents
 *         sampled during perplexity computation, or {@code Double.NaN} if no perplexity
 *         data exists for the given iteration
 * @throws IOException
 */
public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration)
        throws IOException {
    Path perplexityPath = perplexityPath(topicModelStateTemp, iteration);
    FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf);
    if (!fs.exists(perplexityPath)) {
        log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath);
        return Double.NaN;
    }
    double perplexity = 0;
    double modelWeight = 0;
    long n = 0;
    for (Pair<DoubleWritable, DoubleWritable> pair : new SequenceFileDirIterable<DoubleWritable, DoubleWritable>(
            perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
        modelWeight += pair.getFirst().get();
        perplexity += pair.getSecond().get();
        n++;
    }
    log.info("Read {} entries with total perplexity {} and model weight {}",
            new Object[] { n, perplexity, modelWeight });
    return perplexity / modelWeight;
}
From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java
License:Apache License
/**
 * Read the feature frequency list built at the end of the word count job and assign ids to the
 * features. This uses constant memory and runs at the speed of disk reads.
 */
private static List<Path> createDictionaryChunks(Path wordCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();

    Configuration conf = new Configuration(baseConf);
    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
            IntWritable.class);
    try {
        long currentChunkSize = 0;
        Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
        int i = 0;
        for (Pair<Writable, Writable> record : new SequenceFileDirIterable<Writable, Writable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.closeQuietly(dictWriter);
                chunkIndex++;
                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);
                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }
            Writable key = record.getFirst();
            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
        maxTermDimension[0] = i;
    } finally {
        Closeables.closeQuietly(dictWriter);
    }
    return chunkPaths;
}
From source file:com.pocketx.gravity.common.TasteHadoopUtils.java
License:Apache License
/**
 * Reads a binary mapping file.
 */
public static OpenIntLongHashMap readItemIDIndexMap(String itemIDIndexPathStr, Configuration conf) {
    OpenIntLongHashMap indexItemIDMap = new OpenIntLongHashMap();
    Path itemIDIndexPath = new Path(itemIDIndexPathStr);
    for (Pair<VarIntWritable, VarLongWritable> record : new SequenceFileDirIterable<VarIntWritable, VarLongWritable>(
            itemIDIndexPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
        indexItemIDMap.put(record.getFirst().get(), record.getSecond().get());
    }
    return indexItemIDMap;
}
From source file:it.polito.dbdmg.searum.ARM.java
License:Apache License
/**
 * Read the header table built at the end of the parallel counting job.
 *
 * @return header table
 */
public static List<Pair<String, Long>> readFList(Parameters params) {
    Configuration conf = new Configuration();
    Path parallelCountingPath = new Path(params.get(OUTPUT), ITEM_FREQ);

    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
        new Comparator<Pair<String, Long>>() {
            public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                int ret = o2.getSecond().compareTo(o1.getSecond());
                if (ret != 0) {
                    return ret;
                }
                return o1.getFirst().compareTo(o2.getFirst());
            }
        });

    // Derive the absolute support count from the relative threshold. This assumes the
    // counting job wrote a "dataset" record holding the total transaction count; if that
    // record is missing, numTrans stays null and the computation below fails.
    Long numTrans = null;
    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        String feature = record.getFirst().toString();
        if (feature.compareTo("dataset") == 0) {
            numTrans = value;
            break;
        }
    }

    Double relativeSupport = Double.valueOf(params.get(MIN_SUPPORT, "0.9"));
    absSupport = (int) Math.ceil(relativeSupport * numTrans);
    log.info("# Transactions: " + numTrans);
    log.info("Support: " + relativeSupport * 100 + "%");
    log.info("Support count: " + absSupport);
    params.set(MIN_SUPPORT, Long.toString(absSupport));

    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        if (value >= absSupport) {
            queue.add(new Pair<String, Long>(record.getFirst().toString(), value));
        }
    }

    List<Pair<String, Long>> fList = Lists.newArrayList();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}
From source file:mlbench.bayes.BayesUtils.java
License:Apache License
static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);
    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);
    try {
        long currentChunkSize = 0;
        long featureCount = 0;
        long vectorCount = Long.MAX_VALUE;
        Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
        for (Pair<IntWritable, LongWritable> record : new SequenceFileDirIterable<IntWritable, LongWritable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(freqWriter, false);
                chunkIndex++;
                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);
                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }
            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            IntWritable key = record.getFirst();
            LongWritable value = record.getSecond();
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);
        }
        featureCount++;
        Long[] counts = { featureCount, vectorCount };
        return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
        Closeables.close(freqWriter, false);
    }
}
From source file:mlbench.bayes.BayesUtils.java
License:Apache License
static List<Path> createDictionaryChunks(Path wordCountPath, Path dictionaryPathBase, Configuration baseConf,
        int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);
    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
            IntWritable.class);
    try {
        long currentChunkSize = 0;
        Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
        int i = 0;
        for (Pair<Writable, Writable> record : new SequenceFileDirIterable<Writable, Writable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(dictWriter, false);
                chunkIndex++;
                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);
                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }
            Writable key = record.getFirst();
            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
        maxTermDimension[0] = i;
    } finally {
        Closeables.close(dictWriter, false);
    }
    return chunkPaths;
}
From source file:org.gpfvic.mahout.cf.taste.hadoop.TasteHadoopUtils.java
License:Apache License
/**
 * Reads a binary mapping file.
 */
public static OpenIntLongHashMap readIDIndexMap(String idIndexPathStr, Configuration conf) {
    OpenIntLongHashMap indexIDMap = new OpenIntLongHashMap();
    Path itemIDIndexPath = new Path(idIndexPathStr);
    for (Pair<VarIntWritable, VarLongWritable> record : new SequenceFileDirIterable<VarIntWritable, VarLongWritable>(
            itemIDIndexPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
        indexIDMap.put(record.getFirst().get(), record.getSecond().get());
    }
    return indexIDMap;
}