Usage examples for the org.apache.mahout.utils.vectors.lucene TFDFMapper constructor
public TFDFMapper(int numDocs, Weight weight, TermInfo termInfo)
From source file: com.grantingersoll.intell.clustering.KMeansClusteringEngine.java
License:Apache License
private void cluster(SolrIndexSearcher searcher, int k) { log.info("Clustering"); //go and do the clustering. First, we need to export the fields SchemaField keyField = searcher.getSchema().getUniqueKeyField(); //TODO: should we prevent overlaps here if there are too many commits? Clustering isn't something that has to be fresh all the time // and we likely can't sustain that anyway. if (keyField != null) {//we must have a key field //do this part synchronously here, and then spawn off a thread to do the clustering, otherwise it will take too long String idName = keyField.getName(); Weight weight = new TFIDF(); SolrIndexReader reader = searcher.getReader(); try {/*from w w w . ja v a 2s .com*/ TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); LuceneIterable li = new LuceneIterable(reader, idName, inputField, new TFDFMapper(reader, weight, termInfo)); Date now = new Date(); String jobDir = clusterBaseDir.getAbsolutePath() + File.separator + "clusters-" + now.getTime(); log.info("Dumping {} to {}", inputField, clusterBaseDir); File outFile = new File(jobDir, "index-" + inputField + ".vec"); VectorWriter vectorWriter = getSeqFileWriter(outFile.getAbsolutePath()); long numDocs = vectorWriter.write(li, Integer.MAX_VALUE); vectorWriter.close(); log.info("Wrote: {} vectors", numDocs); File dictOutFile = new File(jobDir, "dict-" + inputField + ".txt"); log.info("Dictionary Output file: {}", dictOutFile); BufferedWriter writer = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8"))); JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, "\t", inputField); tiWriter.write(termInfo); tiWriter.close(); writer.close(); //OK, the dictionary is dumped, now we can cluster, do this via a thread in the background. 
//when it's done, we can switch to it ClusterJob clusterJob = new ClusterJob(k, jobDir, new Path(outFile.getAbsolutePath()), new Path(jobDir + File.separator + "clusters"), new Path(jobDir + File.separator + "output"), new Path(dictOutFile.getAbsolutePath())); writeJobDetails(clusterJob); theFuture = execService.submit(new ClusterCallable(clusterJob)); } catch (IOException e) { log.error("Exception", e); } } }