Example usage for org.apache.mahout.common.iterator.sequencefile PathType GLOB

Introduction

On this page you can find example usages of org.apache.mahout.common.iterator.sequencefile.PathType.GLOB, collected from open-source projects.

Prototype

PathType GLOB

Usage
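
Before the project examples below, here is a minimal, self-contained sketch of the typical pattern. PathType.GLOB tells Mahout's sequence-file iterators to expand the given path as a glob pattern against the file system, rather than treating it as a single file or listing a directory (PathType.LIST is the alternative). The glob pattern, key/value classes, and filter below are illustrative assumptions, not taken from any of the examples that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;

public class PathTypeGlobSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Hypothetical glob over the part files of a MapReduce output directory.
        Path pattern = new Path("/data/wordcount/part-r-*");
        // The six-argument constructor (pattern, path type, filter, ordering,
        // reuse key/value instances, configuration) is the form used by most
        // of the examples on this page.
        for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
                pattern, PathType.GLOB, PathFilters.logsCRCFilter(), null, true, conf)) {
            System.out.println(record.getFirst() + "\t" + record.getSecond().get());
        }
    }
}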

From source file: at.illecker.hadoop.rootbeer.examples.matrixmultiplication.DistributedRowMatrix.java

License: Apache License

@Override
public Iterator<MatrixSlice> iterateAll() {
    try {
        Path pathPattern = rowPath;
        if (FileSystem.get(conf).getFileStatus(rowPath).isDir()) {
            pathPattern = new Path(rowPath, "*");
        }
        return Iterators.transform(
                new SequenceFileDirIterator<IntWritable, VectorWritable>(pathPattern, PathType.GLOB,
                        PathFilters.logsCRCFilter(), null, true, conf),
                new Function<Pair<IntWritable, VectorWritable>, MatrixSlice>() {
                    @Override
                    public MatrixSlice apply(Pair<IntWritable, VectorWritable> from) {
                        return new MatrixSlice(from.getSecond().get(), from.getFirst().get());
                    }
                });
    } catch (IOException ioe) {
        throw new IllegalStateException(ioe);
    }
}
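
For context, a hedged sketch of how iterateAll() above might be consumed. The constructor arguments and the setConf call mirror Mahout's stock DistributedRowMatrix and are assumptions here, since this class is a project-specific variant; rowPath, tmpPath, numRows, and numCols are placeholders.

// Hypothetical usage; rowPath is assumed to point at an existing row matrix
// stored as SequenceFiles of IntWritable/VectorWritable.
DistributedRowMatrix matrix = new DistributedRowMatrix(rowPath, tmpPath, numRows, numCols);
matrix.setConf(new Configuration());
Iterator<MatrixSlice> slices = matrix.iterateAll();
while (slices.hasNext()) {
    MatrixSlice slice = slices.next();
    System.out.println(slice.index() + " -> " + slice.vector());
}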

From source file: com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java

License: Apache License

/**
 * Read the feature frequency list which is built at the end of the Parallel counting job.
 *
 * @return the feature frequency list
 */
public static List<Pair<String, Long>> readFList(Parameters params) {
    int minSupport = Integer.valueOf(params.get(MIN_SUPPORT, "3"));
    Configuration conf = new Configuration();

    Path parallelCountingPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);

    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
            new Comparator<Pair<String, Long>>() {
                @Override
                public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                    int ret = o2.getSecond().compareTo(o1.getSecond());
                    if (ret != 0) {
                        return ret;
                    }
                    return o1.getFirst().compareTo(o2.getFirst());
                }
            });

    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        if (value >= minSupport) {
            queue.add(new Pair<String, Long>(record.getFirst().toString(), value));
        }
    }
    List<Pair<String, Long>> fList = Lists.newArrayList();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}
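
A hedged sketch of calling readFList(). It assumes the OUTPUT and MIN_SUPPORT constants are public, as in Mahout's stock PFPGrowth, and that the parallel counting job has already written its output under the hypothetical /tmp/pfpgrowth directory.

// Hypothetical call: point OUTPUT at the directory used by the parallel counting job.
Parameters params = new Parameters();
params.set(PFPGrowth.OUTPUT, "/tmp/pfpgrowth");
params.set(PFPGrowth.MIN_SUPPORT, "3");
List<Pair<String, Long>> fList = PFPGrowth.readFList(params);
for (Pair<String, Long> feature : fList) {
    System.out.println(feature.getFirst() + "\t" + feature.getSecond());
}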

From source file: com.clustertest2.clustertest2.vectorization.DocTokenizer.java

public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();

            tokenized.put(new Text(key), document);
        }
        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            writer.close();
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
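
The SequenceFile.Writer constructor used above is deprecated in Hadoop 2.x. A hedged alternative using the option-based SequenceFile.createWriter factory is sketched below, assuming ClusterFileService.CONF is a plain Hadoop Configuration; with try-with-resources the explicit writer.close() call in the original is unnecessary.

// Sketch only: write the tokenized documents with the option-based factory method.
try (SequenceFile.Writer writer = SequenceFile.createWriter(ClusterFileService.CONF,
        SequenceFile.Writer.file(tokenizedSeq),
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(StringTuple.class))) {
    for (Map.Entry<Text, StringTuple> entry : tokenized.entrySet()) {
        writer.append(entry.getKey(), entry.getValue());
    }
}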

From source file: com.clustertest2.clustertest2.vectorization.TokenBuilder.java

@Override
public void performWork(Path inputDoc, Path outputDir) {
    try {
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder docName = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(inputDoc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            String value = pair.getSecond().toString();
            docName.append(key);
            StringTuple document;
            try (TokenStream stream = analyzer.tokenStream(key, new StringReader(value))) {
                CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                document = new StringTuple();
                while (stream.incrementToken()) {
                    if (termAtt.length() > 0) {
                        document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                    }
                }
                stream.end();
            }
            tokenized.put(new Text(key), document);
        }
        // write the sequencefile
        Path tokenizedSeq = new Path(outputDir, docName.toString());
        // overwrite old vector file
        ClusterFileService.FS.delete(tokenizedSeq, true);
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            writer.close();
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}

From source file: com.elex.dmp.vectorizer.DictionaryVectorizer.java

License: Apache License

/**
 * Read the feature frequency list which is built at the end of the Word Count job and assign IDs
 * to the features. This will use constant memory and will run at the speed of your disk read.
 */
private static List<Path> createDictionaryChunks(Path wordCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();

    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
            IntWritable.class);

    try {
        long currentChunkSize = 0;
        Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
        int i = 0;
        for (Pair<Writable, Writable> record : new SequenceFileDirIterable<Writable, Writable>(filesPattern,
                PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.closeQuietly(dictWriter);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }

            Writable key = record.getFirst();
            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
        maxTermDimension[0] = i;
    } finally {
        Closeables.closeQuietly(dictWriter);
    }

    return chunkPaths;
}
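
Reading the chunks produced above back is itself a natural fit for PathType.GLOB. A hedged sketch follows; the "dictionary.file-*" pattern is an assumption based on the DICTIONARY_FILE prefix convention used by Mahout's vectorizer output.

// Hypothetical read-back of all dictionary chunks in one pass.
Path dictPattern = new Path(dictionaryPathBase, "dictionary.file-*");
for (Pair<Writable, Writable> entry : new SequenceFileDirIterable<Writable, Writable>(
        dictPattern, PathType.GLOB, null, null, true, conf)) {
    System.out.println(entry.getFirst() + " -> " + entry.getSecond());
}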

From source file: it.polito.dbdmg.searum.ARM.java

License: Apache License

/**
 * Read the header table which is built at the end of the Parallel counting
 * job.
 * 
 * @return header table
 */
public static List<Pair<String, Long>> readFList(Parameters params) {
    Configuration conf = new Configuration();

    Path parallelCountingPath = new Path(params.get(OUTPUT), ITEM_FREQ);

    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
            new Comparator<Pair<String, Long>>() {

                public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                    int ret = o2.getSecond().compareTo(o1.getSecond());
                    if (ret != 0) {
                        return ret;
                    }
                    return o1.getFirst().compareTo(o2.getFirst());
                }
            });

    /**
     * Get the absolute support from the relative threshold. This assumes the
     * counting output contains a record keyed "dataset" holding the total
     * number of transactions; otherwise numTrans stays null and the
     * computation below fails.
     */
    Long numTrans = null;

    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        String feature = record.getFirst().toString();
        if (feature.compareTo("dataset") == 0) {
            numTrans = value;
            break;
        }

    }

    Double relativeSupport = Double.valueOf(params.get(MIN_SUPPORT, "0.9"));
    absSupport = (int) Math.ceil((relativeSupport * numTrans));

    log.info("# Transactions: " + numTrans);
    log.info("Support: " + relativeSupport * 100 + "%");
    log.info("Support count: " + absSupport);
    params.set(MIN_SUPPORT, Long.toString(absSupport));

    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        if (value >= absSupport) {
            queue.add(new Pair<String, Long>(record.getFirst().toString(), value));
        }
    }

    List<Pair<String, Long>> fList = Lists.newArrayList();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}

From source file: mlbench.bayes.BayesUtils.java

License: Apache License

static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);

    try {
        long currentChunkSize = 0;
        long featureCount = 0;
        long vectorCount = Long.MAX_VALUE;
        Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
        for (Pair<IntWritable, LongWritable> record : new SequenceFileDirIterable<IntWritable, LongWritable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {

            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(freqWriter, false);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            IntWritable key = record.getFirst();
            LongWritable value = record.getSecond();
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);

        }
        featureCount++;
        Long[] counts = { featureCount, vectorCount };
        return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
        Closeables.close(freqWriter, false);
    }
}
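
A hedged sketch of how the returned pair might be unpacked. The method is package-private, so this assumes a caller in the same mlbench.bayes package; featureCountPath, dictionaryPathBase, and the 64 MB chunk size are placeholders.

// Hypothetical call from within the same package.
Pair<Long[], List<Path>> result = createDictionaryChunks(featureCountPath, dictionaryPathBase, conf, 64);
long featureCount = result.getFirst()[0];   // highest feature id seen, plus one
long vectorCount = result.getFirst()[1];    // taken from the record keyed by -1
List<Path> frequencyChunks = result.getSecond();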

From source file: mlbench.bayes.BayesUtils.java

License: Apache License

static List<Path> createDictionaryChunks(Path wordCountPath, Path dictionaryPathBase, Configuration baseConf,
        int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();

    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
            IntWritable.class);

    try {
        long currentChunkSize = 0;
        Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
        int i = 0;
        for (Pair<Writable, Writable> record : new SequenceFileDirIterable<Writable, Writable>(filesPattern,
                PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(dictWriter, false);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }

            Writable key = record.getFirst();
            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
        maxTermDimension[0] = i;
    } finally {
        Closeables.close(dictWriter, false);
    }

    return chunkPaths;
}

From source file: org.conan.mymahout.clustering.streaming.tools.ClusterQualitySummarizer.java

License: Apache License

public int run(String[] args) throws IOException {
    if (!parseArgs(args)) {
        return -1;
    }

    Configuration conf = new Configuration();
    try {
        //      Configuration.dumpConfiguration(conf, new OutputStreamWriter(System.out));

        fileOut = new PrintWriter(new FileOutputStream(outputFile));
        fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
                + "distance.q4,count,is.train\n");

        // Reading in the centroids (both pairs, if they exist).
        List<Centroid> centroids;
        List<Centroid> centroidsCompare = null;
        if (mahoutKMeansFormat) {
            SequenceFileDirValueIterable<ClusterWritable> clusterIterable = new SequenceFileDirValueIterable<ClusterWritable>(
                    new Path(centroidFile), PathType.GLOB, conf);
            centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
        } else {
            SequenceFileDirValueIterable<CentroidWritable> centroidIterable = new SequenceFileDirValueIterable<CentroidWritable>(
                    new Path(centroidFile), PathType.GLOB, conf);
            centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
        }

        if (centroidCompareFile != null) {
            if (mahoutKMeansFormatCompare) {
                SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable = new SequenceFileDirValueIterable<ClusterWritable>(
                        new Path(centroidCompareFile), PathType.GLOB, conf);
                centroidsCompare = Lists
                        .newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
            } else {
                SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable = new SequenceFileDirValueIterable<CentroidWritable>(
                        new Path(centroidCompareFile), PathType.GLOB, conf);
                centroidsCompare = Lists.newArrayList(
                        IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
            }
        }

        // Reading in the "training" set.
        SequenceFileDirValueIterable<VectorWritable> trainIterable = new SequenceFileDirValueIterable<VectorWritable>(
                new Path(trainFile), PathType.GLOB, conf);
        Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
        Iterable<Vector> datapoints = trainDatapoints;

        printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
                new SquaredEuclideanDistanceMeasure()), "train");

        // Also adding in the "test" set.
        if (testFile != null) {
            SequenceFileDirValueIterable<VectorWritable> testIterable = new SequenceFileDirValueIterable<VectorWritable>(
                    new Path(testFile), PathType.GLOB, conf);
            Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);

            printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
                    new SquaredEuclideanDistanceMeasure()), "test");

            datapoints = Iterables.concat(trainDatapoints, testDatapoints);
        }

        // At this point, all train/test CSVs have been written. We now compute quality metrics.
        List<OnlineSummarizer> summaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroids,
                distanceMeasure);
        List<OnlineSummarizer> compareSummaries = null;
        if (centroidsCompare != null) {
            compareSummaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare,
                    distanceMeasure);
        }
        System.out.printf("[Dunn Index] First: %f",
                ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
        if (compareSummaries != null) {
            System.out.printf(" Second: %f\n",
                    ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
        } else {
            System.out.printf("\n");
        }
        System.out.printf("[Davies-Bouldin Index] First: %f",
                ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
        if (compareSummaries != null) {
            System.out.printf(" Second: %f\n",
                    ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
        } else {
            System.out.printf("\n");
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        Closeables.close(fileOut, false);
    }
    return 0;
}

From source file: tk.summerway.mahout9.tools.MyClusterDumper.java

License: Apache License

public void printClusters(String[] dictionary) throws Exception {
    Configuration conf = new Configuration();

    if (this.termDictionary != null) {
        if ("text".equals(dictionaryFormat)) {
            dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
        } else if ("sequencefile".equals(dictionaryFormat)) {
            dictionary = VectorHelper.loadTermDictionary(conf, this.termDictionary);
        } else {
            throw new IllegalArgumentException("Invalid dictionary format");
        }
    }

    Writer writer;
    boolean shouldClose;
    if (this.outputFile == null) {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    } else {
        shouldClose = true;
        if (outputFile.getName().startsWith("s3n://")) {
            Path p = outputPath;
            FileSystem fs = FileSystem.get(p.toUri(), conf);
            writer = new OutputStreamWriter(fs.create(p), Charsets.UTF_8);
        } else {
            Files.createParentDirs(outputFile);
            writer = Files.newWriter(this.outputFile, Charsets.UTF_8);
        }
    }
    ClusterWriter clusterWriter = createClusterWriter(writer, dictionary);
    try {
        long numWritten = clusterWriter.write(new SequenceFileDirValueIterable<ClusterWritable>(
                new Path(seqFileDir, "part-*"), PathType.GLOB, conf));

        writer.flush();
        if (runEvaluation) {
            MyClusterEvaluator ce = new MyClusterEvaluator(pointsDir.toString(), seqFileDir.toString(),
                    "~/cluster_evaluate_result.txt", measure, 1000L);
            ce.evaluateClusters(conf);
        }
        //            if (runEvaluation) {
        //                HadoopUtil.delete(conf, new Path("tmp/representative"));
        //                int numIters = 5;
        //                RepresentativePointsDriver.main(new String[] { "--input",
        //                        seqFileDir.toString(), "--output",
        //                        "tmp/representative", "--clusteredPoints",
        //                        pointsDir.toString(), "--distanceMeasure",
        //                        measure.getClass().getName(), "--maxIter",
        //                        String.valueOf(numIters) });
        //                conf.set(RepresentativePointsDriver.DISTANCE_MEASURE_KEY,
        //                        measure.getClass().getName());
        //                conf.set(RepresentativePointsDriver.STATE_IN_KEY,
        //                        "tmp/representative/representativePoints-" + numIters);
        //                ClusterEvaluator ce = new ClusterEvaluator(conf, seqFileDir);
        //                writer.append("\n");
        //                writer.append("Inter-Cluster Density: ")
        //                        .append(String.valueOf(ce.interClusterDensity()))
        //                        .append("\n");
        //                writer.append("Intra-Cluster Density: ")
        //                        .append(String.valueOf(ce.intraClusterDensity()))
        //                        .append("\n");
        //                CDbwEvaluator cdbw = new CDbwEvaluator(conf, seqFileDir);
        //                writer.append("CDbw Inter-Cluster Density: ")
        //                        .append(String.valueOf(cdbw.interClusterDensity()))
        //                        .append("\n");
        //                writer.append("CDbw Intra-Cluster Density: ")
        //                        .append(String.valueOf(cdbw.intraClusterDensity()))
        //                        .append("\n");
        //                writer.append("CDbw Separation: ")
        //                        .append(String.valueOf(cdbw.separation())).append("\n");
        //                writer.flush();
        //            }
        log.info("Wrote {} clusters", numWritten);
    } finally {
        if (shouldClose) {
            Closeables.close(clusterWriter, false);
        } else {
            if (clusterWriter instanceof GraphMLClusterWriter) {
                clusterWriter.close();
            }
        }
    }
}