Usage examples for the org.apache.mahout.utils.vectors.lucene TFDFMapper constructor
public TFDFMapper(int numDocs, Weight weight, TermInfo termInfo)
From source file: com.grantingersoll.intell.clustering.KMeansClusteringEngine.java
License:Apache License
private void cluster(SolrIndexSearcher searcher, int k) { log.info("Clustering"); //go and do the clustering. First, we need to export the fields SchemaField keyField = searcher.getSchema().getUniqueKeyField(); //TODO: should we prevent overlaps here if there are too many commits? Clustering isn't something that has to be fresh all the time // and we likely can't sustain that anyway. if (keyField != null) {//we must have a key field //do this part synchronously here, and then spawn off a thread to do the clustering, otherwise it will take too long String idName = keyField.getName(); Weight weight = new TFIDF(); SolrIndexReader reader = searcher.getReader(); try {/*from w w w . ja v a 2s .com*/ TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); LuceneIterable li = new LuceneIterable(reader, idName, inputField, new TFDFMapper(reader, weight, termInfo)); Date now = new Date(); String jobDir = clusterBaseDir.getAbsolutePath() + File.separator + "clusters-" + now.getTime(); log.info("Dumping {} to {}", inputField, clusterBaseDir); File outFile = new File(jobDir, "index-" + inputField + ".vec"); VectorWriter vectorWriter = getSeqFileWriter(outFile.getAbsolutePath()); long numDocs = vectorWriter.write(li, Integer.MAX_VALUE); vectorWriter.close(); log.info("Wrote: {} vectors", numDocs); File dictOutFile = new File(jobDir, "dict-" + inputField + ".txt"); log.info("Dictionary Output file: {}", dictOutFile); BufferedWriter writer = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8"))); JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, "\t", inputField); tiWriter.write(termInfo); tiWriter.close(); writer.close(); //OK, the dictionary is dumped, now we can cluster, do this via a thread in the background. 
//when it's done, we can switch to it ClusterJob clusterJob = new ClusterJob(k, jobDir, new Path(outFile.getAbsolutePath()), new Path(jobDir + File.separator + "clusters"), new Path(jobDir + File.separator + "output"), new Path(dictOutFile.getAbsolutePath())); writeJobDetails(clusterJob); theFuture = execService.submit(new ClusterCallable(clusterJob)); } catch (IOException e) { log.error("Exception", e); } } }