wvec.WordVecsIndexer.java Source code

Introduction

Here is the source code for wvec.WordVecsIndexer.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package wvec;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.apache.commons.math3.ml.clustering.CentroidCluster;
import org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds two Lucene indexes from a text file of word vectors:
 * <ol>
 *   <li>a word-vector index (one document per input line) at the path given
 *       by the {@code wvecs.index} property, and</li>
 *   <li>a cluster-id index (one document per word, holding the id of the
 *       K-means cluster the word was assigned to) under
 *       {@code <wvecs.clusterids.basedir>/<number of clusters>}.</li>
 * </ol>
 * All settings are read from a properties file passed to the constructor.
 *
 * @author Debasis
 */
public class WordVecsIndexer {
    /** Writer of the word-vector index; only open while {@link #writeIndex()} runs. */
    IndexWriter writer;
    /** Configuration loaded from the properties file. */
    Properties prop;
    /** Location of the word-vector index (value of the {@code wvecs.index} property). */
    String indexPath;

    /** Field holding the word (or, in the cluster index, also the word). */
    static final public String FIELD_WORD_NAME = "wordname";
    /** Field holding the full input line (or, in the cluster index, the cluster id). */
    static final public String FIELD_WORD_VEC = "wordvec";

    /**
     * Loads the configuration.
     *
     * @param propFile path of the properties file to load
     * @throws Exception if the file cannot be opened or read
     */
    public WordVecsIndexer(String propFile) throws Exception {
        prop = new Properties();
        // try-with-resources: the reader used to be left open.
        try (FileReader propReader = new FileReader(propFile)) {
            prop.load(propReader);
        }
        indexPath = prop.getProperty("wvecs.index");
    }

    /**
     * (Re)creates the word-vector index from the file named by the
     * {@code wvecs.txt} property and then stores the cluster information.
     *
     * @throws IllegalStateException if {@code wvecs.index} or {@code wvecs.txt} is not configured
     * @throws Exception             on any I/O or indexing failure
     */
    public void writeIndex() throws Exception {
        // Validate the configuration BEFORE opening the index in CREATE mode,
        // so a misconfiguration cannot wipe an existing index and then fail.
        if (indexPath == null) {
            throw new IllegalStateException("Missing required property: wvecs.index");
        }
        String fileToRead = requireProperty("wvecs.txt");

        IndexWriterConfig iwcfg = new IndexWriterConfig(new WhitespaceAnalyzer());
        iwcfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        // Both the directory and the writer are closed even if indexing fails.
        try (FSDirectory indexDir = FSDirectory.open((new File(indexPath)).toPath());
             IndexWriter openedWriter = new IndexWriter(indexDir, iwcfg)) {
            writer = openedWriter;
            indexFile(new File(fileToRead));
        }

        storeClusterInfo();
    }

    /**
     * Creates a two-field Lucene document. Both fields are stored and indexed
     * as a single un-analyzed term.
     *
     * @param id   value of {@link #FIELD_WORD_NAME}
     * @param line value of {@link #FIELD_WORD_VEC}
     * @return the new document
     */
    Document constructDoc(String id, String line) throws Exception {
        Document doc = new Document();
        doc.add(new Field(FIELD_WORD_NAME, id, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(FIELD_WORD_VEC, line, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
        return doc;
    }

    /**
     * Clusters the indexed vocabulary and writes the word-to-cluster-id
     * mapping into its own index, unless the target directory already exists.
     * The number of clusters comes from {@code retrieve.vocabcluster.numclusters}
     * (default 100).
     *
     * @throws IllegalStateException if {@code wvecs.clusterids.basedir} is not configured
     * @throws Exception             on any I/O, indexing or clustering failure
     */
    void storeClusterInfo() throws Exception {
        int numClusters = Integer.parseInt(
                prop.getProperty("retrieve.vocabcluster.numclusters", "100").trim());
        // A missing base dir used to silently become the path "null/<k>".
        String clusterInfoBaseDir = requireProperty("wvecs.clusterids.basedir");
        String clusterInfoDirPath = clusterInfoBaseDir + "/" + numClusters;

        File clusterInfoFile = new File(clusterInfoDirPath);
        if (clusterInfoFile.isDirectory()) {
            System.out.println("Cluster info already exists...");
            return;
        }

        // Create the directory (including missing parents) and fail loudly if
        // that is impossible, e.g. because a regular file is in the way.
        if (!clusterInfoFile.mkdirs() && !clusterInfoFile.isDirectory()) {
            throw new java.io.IOException(
                    "Could not create cluster info directory: " + clusterInfoFile);
        }

        IndexWriterConfig iwcfg = new IndexWriterConfig(new WhitespaceAnalyzer());
        iwcfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        try {
            try (FSDirectory clusterDir = FSDirectory.open(clusterInfoFile.toPath());
                 IndexWriter clusterInfoFileWriter = new IndexWriter(clusterDir, iwcfg)) {
                clusterWordVecs(clusterInfoFileWriter, numClusters);
            }
        } catch (Exception ex) {
            // The directory's existence is what marks the cluster info as
            // "already done" on the next run, so a half-written index must not
            // be left behind. The directory was created above by this call,
            // hence everything inside it belongs to this failed attempt.
            discardPartialIndex(clusterInfoFile);
            throw ex;
        }
    }

    /**
     * Loads every stored word vector into memory, runs K-means++ on them and
     * adds one document per word (word, cluster id) to the given writer.
     * Cluster ids are assigned 0, 1, 2, ... in the order the clusterer returns
     * the clusters.
     *
     * @param clusterIndexWriter open writer of the cluster-id index; not closed here
     * @param numClusters        number of clusters to build
     * @throws Exception on I/O failure, or if the clusterer rejects the input
     */
    void clusterWordVecs(IndexWriter clusterIndexWriter, int numClusters) throws Exception {
        KMeansPlusPlusClusterer<WordVec> clusterer = new KMeansPlusPlusClusterer<>(numClusters);
        List<WordVec> wordList;

        // Index where word vectors are stored. The reader is only needed while
        // loading, and is released even if loading fails.
        try (FSDirectory indexDir = FSDirectory.open((new File(indexPath)).toPath());
             IndexReader reader = DirectoryReader.open(indexDir)) {
            int numDocs = reader.numDocs();
            wordList = new ArrayList<>(numDocs);

            // Read every wvec and load in memory
            for (int i = 0; i < numDocs; i++) {
                Document doc = reader.document(i);
                wordList.add(new WordVec(doc.get(FIELD_WORD_VEC)));
            }
        }

        // Call K-means clustering
        System.out.println("Clustering the entire vocabulary...");
        List<CentroidCluster<WordVec>> clusters = clusterer.cluster(wordList);

        // Save the cluster info
        System.out.println("Writing out cluster ids in Lucene index...");
        int clusterId = 0;
        for (CentroidCluster<WordVec> c : clusters) {
            for (WordVec thisPoint : c.getPoints()) {
                Document clusterInfo = constructDoc(thisPoint.word, String.valueOf(clusterId));
                clusterIndexWriter.addDocument(clusterInfo);
            }
            clusterId++;
        }
    }

    /**
     * Adds one document per line of the given file to {@link #writer}. The
     * text before the first space is the word; the whole line is stored as the
     * vector. Lines containing no space (e.g. blank lines) carry no vector and
     * are skipped.
     *
     * @param file text file with one word vector per line
     * @throws Exception on I/O or indexing failure
     */
    void indexFile(File file) throws Exception {
        final int batchSize = 10000;
        int count = 0;

        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;

            // Each line is word vector
            while ((line = br.readLine()) != null) {
                int firstSpaceIndex = line.indexOf(' ');
                if (firstSpaceIndex < 0) {
                    // Previously crashed with StringIndexOutOfBoundsException.
                    continue;
                }
                String id = line.substring(0, firstSpaceIndex);
                Document luceneDoc = constructDoc(id, line);

                if (count % batchSize == 0) {
                    System.out.println("Added " + count + " words...");
                }

                writer.addDocument(luceneDoc);
                count++;
            }
        }
    }

    /** Returns the value of a mandatory property or fails with a clear message. */
    private String requireProperty(String key) {
        String value = prop.getProperty(key);
        if (value == null || value.trim().isEmpty()) {
            throw new IllegalStateException("Missing required property: " + key);
        }
        return value;
    }

    /** Best-effort removal of a (flat) index directory left by a failed run. */
    private static void discardPartialIndex(File dir) {
        File[] leftovers = dir.listFiles();
        if (leftovers != null) {
            for (File leftover : leftovers) {
                leftover.delete();
            }
        }
        dir.delete();
    }

    /**
     * Use this to index the output of word2vec, i.e. instead
     * of loading the word vectors from an in-memory hashmap
     * keyed by a word, use Lucene search to retrieve the vector
     * given a word. This makes it possible to run this on
     * a limited memory environment.
     *
     * @param args optional single argument: path of the properties file;
     *             {@code init.properties} is used when none is given
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            args = new String[1];
            System.out.println("Usage: java WordVecsIndexer <prop-file>");
            args[0] = "init.properties";
        }

        try {
            WordVecsIndexer wvIndexer = new WordVecsIndexer(args[0]);
            wvIndexer.writeIndex();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

}