com.rondhuit.w2v.lucene.LuceneIndexCorpus.java Source code

Java tutorial

Introduction

Below is the complete source code for the class com.rondhuit.w2v.lucene.LuceneIndexCorpus (file LuceneIndexCorpus.java).

Source

/*
 *  Copyright (c) 2014 RONDHUIT Co.,Ltd.
 *  
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 *
 */

package com.rondhuit.w2v.lucene;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

import com.rondhuit.w2v.Config;
import com.rondhuit.w2v.Corpus;

/**
 * A {@link Corpus} implementation that streams its training text out of a
 * Lucene index: the vocabulary is built from the term dictionary of a single
 * field, and words are produced by re-analyzing the stored values of that
 * field document by document.
 */
public class LuceneIndexCorpus extends Corpus {

    private IndexReader reader;
    private final String field;
    private TopDocs topDocs;
    private final Analyzer analyzer;
    /** Cursor into topDocs.scoreDocs; advanced by nextWord(), repositioned by rewind(). */
    int tdPos;

    /**
     * Opens the index described by the given configuration.
     *
     * @param config must be a {@link LuceneIndexConfig} carrying the index
     *               directory, the field to read, and the analyzer class name
     * @throws IOException if the index directory cannot be opened
     */
    public LuceneIndexCorpus(Config config) throws IOException {
        super(config);

        LuceneIndexConfig liConfig = (LuceneIndexConfig) config;
        field = liConfig.getField();
        analyzer = loadAnalyzer(liConfig.getAnalyzer());
        Directory dir = FSDirectory.open(new File(liConfig.getIndexDir()));
        reader = DirectoryReader.open(dir);
    }

    /**
     * Instantiates an {@link Analyzer} by fully qualified class name using its
     * public no-arg constructor.
     *
     * @param fqcn fully qualified class name of the analyzer implementation
     * @throws IllegalArgumentException if the class cannot be loaded or
     *         instantiated; the original failure is preserved as the cause
     */
    static Analyzer loadAnalyzer(String fqcn) {
        try {
            // getDeclaredConstructor().newInstance() instead of the deprecated
            // Class.newInstance(): it does not silently propagate checked
            // exceptions thrown by the constructor.
            return (Analyzer) Class.forName(fqcn).getDeclaredConstructor().newInstance();
        } catch (Exception e) {
            throw new IllegalArgumentException("cannot instantiate Analyzer \"" + fqcn + "\"", e);
        }
    }

    /**
     * Copy constructor used to hand an independent token cursor to each worker
     * thread. The {@link IndexReader} and the search results are shared with
     * the source; only the analyzer is re-instantiated.
     * <p>
     * NOTE(review): because the reader is shared, {@link #close()} on any
     * clone closes it for all clones — confirm callers close only once.
     *
     * @param cloneSrc must itself be a LuceneIndexCorpus
     */
    public LuceneIndexCorpus(Corpus cloneSrc) throws IOException {
        super(cloneSrc);

        LuceneIndexCorpus lic = (LuceneIndexCorpus) cloneSrc;

        config = lic.config;
        reader = lic.reader;
        field = lic.field;
        topDocs = lic.topDocs;
        analyzer = loadAnalyzer(((LuceneIndexConfig) config).getAnalyzer());
    }

    /**
     * Builds the vocabulary from the term dictionary of the configured field
     * and caches the set of matching documents for later streaming.
     *
     * @throws IOException if the index cannot be read
     * @throws IllegalStateException if the field has no indexed terms
     */
    @Override
    public void learnVocab() throws IOException {
        super.learnVocab();

        final String field = ((LuceneIndexConfig) config).getField();
        final Terms terms = MultiFields.getTerms(reader, field);
        if (terms == null) {
            // Fail fast with context instead of an opaque NullPointerException
            // when the field does not exist or is not indexed.
            throw new IllegalStateException("field \"" + field + "\" has no indexed terms");
        }
        final BytesRef maxTerm = terms.getMax();
        final BytesRef minTerm = terms.getMin();
        // A [min..max] term range query matches every document containing at
        // least one term of the field.
        Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true);
        IndexSearcher searcher = new IndexSearcher(reader);
        topDocs = searcher.search(q, Integer.MAX_VALUE);

        // Enumerate every distinct term; its corpus-wide frequency becomes the
        // vocabulary count.
        TermsEnum termsEnum = terms.iterator(null);
        termsEnum.seekCeil(new BytesRef());
        BytesRef term = termsEnum.term();
        while (term != null) {
            int p = addWordToVocab(term.utf8ToString());
            // NOTE(review): totalTermFreq() is a long; a frequency above
            // Integer.MAX_VALUE would overflow this narrowing cast.
            vocab[p].setCn((int) termsEnum.totalTermFreq());
            term = termsEnum.next();
        }
    }

    // --- streaming state consumed by nextWord() ---
    TokenStream tokenStream = null;       // current token stream, null between field values
    CharTermAttribute termAtt = null;     // term attribute of the current stream
    String[] values = new String[] {};    // stored values of the current document
    int valPos = 0;                       // next value of values[] to analyze

    /**
     * Positions this corpus at its thread-local slice of the result set:
     * thread {@code id} of {@code numThreads} starts at an even split of the
     * total hit count.
     */
    @Override
    public void rewind(int numThreads, int id) throws IOException {
        super.rewind(numThreads, id);
        tdPos = topDocs.totalHits / numThreads * id;
    }

    /**
     * Returns the next word of the corpus, or {@code null} at a boundary:
     * with {@code eoc == false} the null marks the start of a new field value
     * (sentence boundary for the trainer), with {@code eoc == true} it marks
     * the end of the corpus.
     *
     * @throws IOException if analysis or document retrieval fails
     */
    @Override
    public String nextWord() throws IOException {

        while (true) {
            // Drain the current token stream first.
            if (tokenStream != null && tokenStream.incrementToken()) {
                return new String(termAtt.buffer(), 0, termAtt.length());
            }

            // Stream exhausted: release it before moving on.
            if (tokenStream != null)
                tokenStream.close();
            if (valPos < values.length) {
                // Start analyzing the next stored value of the current document.
                tokenStream = analyzer.tokenStream(field, values[valPos++]);
                termAtt = tokenStream.getAttribute(CharTermAttribute.class);
                tokenStream.reset();
                eoc = false;
                return null; // boundary marker: new sentence begins
            } else {
                if (tdPos >= topDocs.totalHits) {
                    tokenStream = null;
                    eoc = true;
                    return null; // end of index == end of corpus
                }
                Document doc = reader.document(topDocs.scoreDocs[tdPos++].doc);
                values = doc.getValues(field); // This method returns an empty array when there are no matching fields.
                                               // It never returns null.
                valPos = 0;
                tokenStream = null;
            }
        }
    }

    /**
     * Closes the underlying index reader. NOTE(review): the reader is shared
     * with clones created via the copy constructor, so this must be called
     * exactly once across all of them.
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }
}