package wvec;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.apache.commons.math3.ml.clustering.CentroidCluster;
import org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

// @author Debasis
public class WordVecsIndexer {
    IndexWriter writer;
    Properties prop;
    String indexPath;

    static final public String FIELD_WORD_NAME = "wordname";
    static final public String FIELD_WORD_VEC = "wordvec";

    public WordVecsIndexer(String propFile) throws Exception {
        prop = new Properties();
        prop.load(new FileReader(propFile));
        indexPath = prop.getProperty("wvecs.index");

    public void writeIndex() throws Exception {

        IndexWriterConfig iwcfg = new IndexWriterConfig(new WhitespaceAnalyzer());

        writer = new IndexWriter(FSDirectory.open((new File(indexPath)).toPath()), iwcfg);

        String fileToRead = prop.getProperty("wvecs.txt");
        indexFile(new File(fileToRead));


    Document constructDoc(String id, String line) throws Exception {
        Document doc = new Document();
        doc.add(new Field(FIELD_WORD_NAME, id, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(FIELD_WORD_VEC, line, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
        return doc;

    void storeClusterInfo() throws Exception {
        int numClusters = Integer.parseInt(prop.getProperty("retrieve.vocabcluster.numclusters", "100"));
        String clusterInfoBaseDir = prop.getProperty("wvecs.clusterids.basedir");
        String clusterInfoDirPath = clusterInfoBaseDir + "/" + numClusters;

        File clusterInfoFile = new File(clusterInfoDirPath);
        if (clusterInfoFile.isDirectory() && clusterInfoFile.exists()) {
            System.out.println("Cluster info already exists...");

        // Create the directory...
        IndexWriterConfig iwcfg = new IndexWriterConfig(new WhitespaceAnalyzer());

        IndexWriter clusterInfoFileWriter = new IndexWriter(FSDirectory.open(clusterInfoFile.toPath()), iwcfg);
        clusterWordVecs(clusterInfoFileWriter, numClusters);

    void clusterWordVecs(IndexWriter clusterIndexWriter, int numClusters) throws Exception {
        // Index where word vectors are stored
        IndexReader reader = DirectoryReader.open(FSDirectory.open((new File(indexPath)).toPath()));
        int numDocs = reader.numDocs();
        KMeansPlusPlusClusterer<WordVec> clusterer = new KMeansPlusPlusClusterer<>(numClusters);
        List<WordVec> wordList = new ArrayList<>(numDocs);

        // Read every wvec and load in memory
        for (int i = 0; i < numDocs; i++) {
            Document doc = reader.document(i);
            WordVec wvec = new WordVec(doc.get(FIELD_WORD_VEC));

        // Call K-means clustering
        System.out.println("Clustering the entire vocabulary...");
        List<CentroidCluster<WordVec>> clusters = clusterer.cluster(wordList);

        // Save the cluster info
        System.out.println("Writing out cluster ids in Lucene index...");
        int clusterId = 0;
        for (CentroidCluster<WordVec> c : clusters) {
            List<WordVec> pointsInThisClusuter = c.getPoints();
            for (WordVec thisPoint : pointsInThisClusuter) {
                Document clusterInfo = constructDoc(thisPoint.word, String.valueOf(clusterId));


    void indexFile(File file) throws Exception {
        FileReader fr = new FileReader(file);
        BufferedReader br = new BufferedReader(fr);
        String line;

        final int batchSize = 10000;
        int count = 0;

        // Each line is word vector
        while ((line = br.readLine()) != null) {

            int firstSpaceIndex = line.indexOf(" ");
            String id = line.substring(0, firstSpaceIndex);
            Document luceneDoc = constructDoc(id, line);

            if (count % batchSize == 0) {
                System.out.println("Added " + count + " words...");


    /* Use this to index the output of word2vec, i.e. instead
       of loading the word vectors from an in-memory hashmap
       keyed by a word, use Lucene search to retrieve the vector
       given a word. This makes it possible to run this on
       a limited memory environment. */
    public static void main(String[] args) {
        if (args.length == 0) {
            args = new String[1];
            System.out.println("Usage: java WordVecsIndexer <prop-file>");
            args[0] = "init.properties";

        try {
            WordVecsIndexer wvIndexer = new WordVecsIndexer(args[0]);
        } catch (Exception ex) {
