Example usage for org.apache.mahout.common.iterator.sequencefile SequenceFileDirIterable SequenceFileDirIterable


Introduction

This page collects example usages of the org.apache.mahout.common.iterator.sequencefile SequenceFileDirIterable constructor.

Prototype

public SequenceFileDirIterable(Path path, PathType pathType, PathFilter filter, Comparator<FileStatus> ordering,
        boolean reuseKeyValueInstances, Configuration conf) 
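
A minimal, self-contained sketch of how this constructor can be invoked is shown below. The input directory, the Text/IntWritable key and value types, and the use of PathFilters.partFilter() are illustrative assumptions only; they are not taken from the usage examples that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;

public class SequenceFileDirIterableSketch {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical directory holding part-* SequenceFiles of <Text, IntWritable> pairs.
        Path input = new Path("/tmp/example-seqfiles");

        // PathType.LIST iterates the files directly under the path; PathFilters.partFilter()
        // keeps only part-* outputs. A null ordering means no particular file order, and
        // reuseKeyValueInstances=true lets the iterator reuse Writable instances per record.
        for (Pair<Text, IntWritable> record : new SequenceFileDirIterable<Text, IntWritable>(
                input, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
            System.out.println(record.getFirst() + " => " + record.getSecond().get());
        }
    }
}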


Usage

From source file: ac.keio.sslab.nlp.lda.RowIdJob.java

License: Apache License

@SuppressWarnings("deprecation")
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Path outputPath = getOutputPath();
    Path indexPath = new Path(outputPath, "docIndex");
    Path matrixPath = new Path(outputPath, "matrix");

    try (SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs, conf, indexPath, IntWritable.class,
            Text.class);
            SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs, conf, matrixPath,
                    IntWritable.class, VectorWritable.class)) {
        IntWritable docId = new IntWritable();
        int i = 0;
        int numCols = 0;
        for (Pair<Text, VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>(
                getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), null, true, conf)) {
            VectorWritable value = record.getSecond();
            docId.set(i);
            indexWriter.append(docId, record.getFirst());
            matrixWriter.append(docId, value);
            i++;
            numCols = value.get().size();
        }

        log.info("Wrote out matrix with {} rows and {} columns to {}", i, numCols, matrixPath);
        return 0;
    }
}

From source file: com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java

License: Apache License

/**
 * Reads the feature frequency list which is built at the end of the Parallel counting job.
 *
 * @return feature frequency list
 */
public static List<Pair<String, Long>> readFList(Parameters params) {
    int minSupport = Integer.valueOf(params.get(MIN_SUPPORT, "3"));
    Configuration conf = new Configuration();

    Path parallelCountingPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);

    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
            new Comparator<Pair<String, Long>>() {
                @Override
                public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                    int ret = o2.getSecond().compareTo(o1.getSecond());
                    if (ret != 0) {
                        return ret;
                    }
                    return o1.getFirst().compareTo(o2.getFirst());
                }
            });

    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        if (value >= minSupport) {
            queue.add(new Pair<String, Long>(record.getFirst().toString(), value));
        }
    }
    List<Pair<String, Long>> fList = Lists.newArrayList();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}

From source file: com.elex.dmp.lda.CVB0Driver.java

License: Apache License

/**
 * @param topicModelStateTemp
 * @param iteration
 * @return the perplexity of the documents sampled during perplexity computation, normalized by
 *         their model weight, or {@code Double.NaN} if no perplexity data exists for the given
 *         iteration
 * @throws IOException
 */
public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration)
        throws IOException {
    Path perplexityPath = perplexityPath(topicModelStateTemp, iteration);
    FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf);
    if (!fs.exists(perplexityPath)) {
        log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath);
        return Double.NaN;
    }
    double perplexity = 0;
    double modelWeight = 0;
    long n = 0;
    for (Pair<DoubleWritable, DoubleWritable> pair : new SequenceFileDirIterable<DoubleWritable, DoubleWritable>(
            perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
        modelWeight += pair.getFirst().get();
        perplexity += pair.getSecond().get();
        n++;
    }
    log.info("Read {} entries with total perplexity {} and model weight {}",
            new Object[] { n, perplexity, modelWeight });
    return perplexity / modelWeight;
}

From source file: com.elex.dmp.vectorizer.DictionaryVectorizer.java

License: Apache License

/**
 * Reads the feature frequency list which is built at the end of the Word Count job and assigns
 * ids to the features. This uses constant memory and runs at the speed of your disk read.
 */
private static List<Path> createDictionaryChunks(Path wordCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();

    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
            IntWritable.class);

    try {
        long currentChunkSize = 0;
        Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
        int i = 0;
        for (Pair<Writable, Writable> record : new SequenceFileDirIterable<Writable, Writable>(filesPattern,
                PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.closeQuietly(dictWriter);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }

            Writable key = record.getFirst();
            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
        maxTermDimension[0] = i;
    } finally {
        Closeables.closeQuietly(dictWriter);
    }

    return chunkPaths;
}

From source file: com.pocketx.gravity.common.TasteHadoopUtils.java

License: Apache License

/**
 * Reads a binary mapping file
 */
public static OpenIntLongHashMap readItemIDIndexMap(String itemIDIndexPathStr, Configuration conf) {
    OpenIntLongHashMap indexItemIDMap = new OpenIntLongHashMap();
    Path itemIDIndexPath = new Path(itemIDIndexPathStr);
    for (Pair<VarIntWritable, VarLongWritable> record : new SequenceFileDirIterable<VarIntWritable, VarLongWritable>(
            itemIDIndexPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
        indexItemIDMap.put(record.getFirst().get(), record.getSecond().get());
    }
    return indexItemIDMap;
}

From source file: it.polito.dbdmg.searum.ARM.java

License: Apache License

/**
 * Reads the header table which is built at the end of the Parallel counting
 * job.
 * 
 * @return header table
 */
public static List<Pair<String, Long>> readFList(Parameters params) {
    Configuration conf = new Configuration();

    Path parallelCountingPath = new Path(params.get(OUTPUT), ITEM_FREQ);

    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
            new Comparator<Pair<String, Long>>() {

                public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                    int ret = o2.getSecond().compareTo(o1.getSecond());
                    if (ret != 0) {
                        return ret;
                    }
                    return o1.getFirst().compareTo(o2.getFirst());
                }
            });

    /**
     * Get absolute support from relative threshold
     */
    Long numTrans = null;

    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        String feature = record.getFirst().toString();
        if (feature.compareTo("dataset") == 0) {
            numTrans = value;
            break;
        }

    }

    Double relativeSupport = Double.valueOf(params.get(MIN_SUPPORT, "0.9"));
    absSupport = (int) Math.ceil((relativeSupport * numTrans));

    log.info("# Transactions: " + numTrans);
    log.info("Support: " + relativeSupport * 100 + "%");
    log.info("Support count: " + absSupport);
    params.set(MIN_SUPPORT, Long.toString(absSupport));

    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        if (value >= absSupport) {
            queue.add(new Pair<String, Long>(record.getFirst().toString(), value));
        }
    }

    List<Pair<String, Long>> fList = Lists.newArrayList();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}

From source file: mlbench.bayes.BayesUtils.java

License: Apache License

static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);

    try {
        long currentChunkSize = 0;
        long featureCount = 0;
        long vectorCount = Long.MAX_VALUE;
        Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
        for (Pair<IntWritable, LongWritable> record : new SequenceFileDirIterable<IntWritable, LongWritable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {

            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(freqWriter, false);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            IntWritable key = record.getFirst();
            LongWritable value = record.getSecond();
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);

        }
        featureCount++;
        Long[] counts = { featureCount, vectorCount };
        return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
        Closeables.close(freqWriter, false);
    }
}

From source file: mlbench.bayes.BayesUtils.java

License: Apache License

static List<Path> createDictionaryChunks(Path wordCountPath, Path dictionaryPathBase, Configuration baseConf,
        int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();

    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
            IntWritable.class);

    try {
        long currentChunkSize = 0;
        Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
        int i = 0;
        for (Pair<Writable, Writable> record : new SequenceFileDirIterable<Writable, Writable>(filesPattern,
                PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(dictWriter, false);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }

            Writable key = record.getFirst();
            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
        maxTermDimension[0] = i;
    } finally {
        Closeables.close(dictWriter, false);
    }

    return chunkPaths;
}

From source file: org.gpfvic.mahout.cf.taste.hadoop.TasteHadoopUtils.java

License: Apache License

/**
 * Reads a binary mapping file
 */
public static OpenIntLongHashMap readIDIndexMap(String idIndexPathStr, Configuration conf) {
    OpenIntLongHashMap indexIDMap = new OpenIntLongHashMap();
    Path itemIDIndexPath = new Path(idIndexPathStr);
    for (Pair<VarIntWritable, VarLongWritable> record : new SequenceFileDirIterable<VarIntWritable, VarLongWritable>(
            itemIDIndexPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
        indexIDMap.put(record.getFirst().get(), record.getSecond().get());
    }
    return indexIDMap;
}