Example usage for org.apache.mahout.utils.vectors.lucene CachedTermInfo CachedTermInfo

List of usage examples for org.apache.mahout.utils.vectors.lucene CachedTermInfo CachedTermInfo

Introduction

In this page you can find the example usage for org.apache.mahout.utils.vectors.lucene CachedTermInfo CachedTermInfo.

Prototype

public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException 

Source Link

Usage

From source file: com.grantingersoll.intell.clustering.KMeansClusteringEngine.java

License: Apache License

/**
 * Exports TF-IDF vectors and the term dictionary for {@code inputField} into a
 * fresh timestamped job directory, then submits the actual k-means clustering to
 * run asynchronously on {@code execService}.
 *
 * @param searcher searcher whose index supplies the documents to cluster
 * @param k        number of clusters to produce
 */
private void cluster(SolrIndexSearcher searcher, int k) {
    log.info("Clustering");
    //go and do the clustering.  First, we need to export the fields
    SchemaField keyField = searcher.getSchema().getUniqueKeyField();
    //TODO: should we prevent overlaps here if there are too many commits?  Clustering isn't something that has to be fresh all the time
    // and we likely can't sustain that anyway.
    if (keyField != null) {//we must have a key field
        //do this part synchronously here, and then spawn off a thread to do the clustering, otherwise it will take too long
        String idName = keyField.getName();
        Weight weight = new TFIDF();
        SolrIndexReader reader = searcher.getReader();
        try {
            // minDf=1, maxDfPercent=100: cache every term of the field, no pruning
            TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
            LuceneIterable li = new LuceneIterable(reader, idName, inputField,
                    new TFDFMapper(reader, weight, termInfo));
            Date now = new Date();
            String jobDir = clusterBaseDir.getAbsolutePath() + File.separator + "clusters-" + now.getTime();
            // log the actual destination (jobDir), not just the base directory
            log.info("Dumping {} to {}", inputField, jobDir);
            File outFile = new File(jobDir, "index-" + inputField + ".vec");
            VectorWriter vectorWriter = getSeqFileWriter(outFile.getAbsolutePath());
            long numDocs;
            try {
                numDocs = vectorWriter.write(li, Integer.MAX_VALUE);
            } finally {
                // close even when the write throws, so the sequence file isn't leaked
                vectorWriter.close();
            }
            log.info("Wrote: {} vectors", numDocs);
            File dictOutFile = new File(jobDir, "dict-" + inputField + ".txt");
            log.info("Dictionary Output file: {}", dictOutFile);
            BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
            JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, "\t", inputField);
            try {
                tiWriter.write(termInfo);
            } finally {
                // close both on the error path too; double-close of the underlying
                // writer is harmless per the Closeable contract
                tiWriter.close();
                writer.close();
            }
            //OK, the dictionary is dumped, now we can cluster, do this via a thread in the background.
            //when it's done, we can switch to it
            ClusterJob clusterJob = new ClusterJob(k, jobDir, new Path(outFile.getAbsolutePath()),
                    new Path(jobDir + File.separator + "clusters"),
                    new Path(jobDir + File.separator + "output"), new Path(dictOutFile.getAbsolutePath()));

            writeJobDetails(clusterJob);
            theFuture = execService.submit(new ClusterCallable(clusterJob));
        } catch (IOException e) {
            // best-effort: log and skip this clustering pass rather than propagating
            log.error("Exception", e);
        }
    }

}