org.nlp4l.lucene.LuceneDocTermVector.java Source code

Introduction

Here is the source code for org.nlp4l.lucene.LuceneDocTermVector.java

Source

/*
 * Copyright 2015 RONDHUIT Co.,LTD.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.nlp4l.lucene;

import java.io.IOException;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

import org.apache.lucene.index.*;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

/**
 * Term vector of a Lucene document, holding the document's terms ranked by weight.
 * 
 * @since 0.2
 */
public class LuceneDocTermVector {

    private final TermWeightQueue queue;

    /**
     * Creates a term vector for the given Lucene document.
     * 
     * @param reader {@link IndexReader} for the Lucene index to read from
     * @param docId Lucene document ID of the target document
     * @param fieldName name of the Lucene field to build the term vector from
     * @param size maximum number of terms to keep
     * @throws IOException if the index cannot be read
     */
    public LuceneDocTermVector(IndexReader reader, int docId, String fieldName, int size) throws IOException {
        this(reader, docId, fieldName, size, null, null, null, null);
    }

    /**
     * Creates a term vector for the given Lucene document.
     * 
     * @param reader {@link IndexReader} for the Lucene index to read from
     * @param docId Lucene document ID of the target document
     * @param fieldName name of the Lucene field to build the term vector from
     * @param size maximum number of terms to keep
     * @param termsReuse {@link Terms} to reuse; if null, the term vector is read from the index
     * @param twf {@link TermWeightFactory} to use; if null, {@link DefaultTfIdfTermWeightFactory} is used
     * @throws IOException if the index cannot be read
     */
    public LuceneDocTermVector(IndexReader reader, int docId, String fieldName, int size, Terms termsReuse,
            TermWeightFactory twf) throws IOException {
        this(reader, docId, fieldName, size, termsReuse, null, twf, null);
    }

    /**
     * Creates a term vector for the given Lucene document.
     * 
     * @param reader {@link IndexReader} for the Lucene index to read from
     * @param docId Lucene document ID of the target document
     * @param fieldName name of the Lucene field to build the term vector from
     * @param size maximum number of terms to keep
     * @param termsReuse {@link Terms} to reuse; if null, the term vector is read from the index
     * @param liveDocs live document {@link Bits}; if null, obtained from the reader
     * @throws IOException if the index cannot be read
     */
    public LuceneDocTermVector(IndexReader reader, int docId, String fieldName, int size, Terms termsReuse,
            Bits liveDocs) throws IOException {
        this(reader, docId, fieldName, size, termsReuse, liveDocs, null, null);
    }

    /**
     * Creates a term vector for the given Lucene document.
     * 
     * @param reader {@link IndexReader} for the Lucene index to read from
     * @param docId Lucene document ID of the target document
     * @param fieldName name of the Lucene field to build the term vector from
     * @param size maximum number of terms to keep
     * @param termsReuse {@link Terms} to reuse; if null, the term vector is read from the index
     * @param liveDocs live document {@link Bits}; if null, obtained from the reader
     * @param twf {@link TermWeightFactory} to use; if null, {@link DefaultTfIdfTermWeightFactory} is used
     * @param stopWords terms to exclude from the term vector; may be null
     * @throws IOException if the index cannot be read
     */
    public LuceneDocTermVector(IndexReader reader, int docId, String fieldName, int size, Terms termsReuse,
            Bits liveDocs, TermWeightFactory twf, Set<String> stopWords) throws IOException {
        liveDocs = liveDocs == null ? MultiFields.getLiveDocs(reader) : liveDocs;
        twf = twf == null ? new DefaultTfIdfTermWeightFactory(reader, docId, fieldName, liveDocs) : twf;
        queue = new TermWeightQueue(size);

        if (termsReuse == null)
            termsReuse = reader.getTermVector(docId, fieldName);
        TermsEnum termsEnum = termsReuse.iterator();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            // candidate feature term
            final String term = text.utf8ToString();
            if (stopWords != null && stopWords.contains(term))
                continue;
            final TermWeight termWeight = twf.create(text);
            if (termWeight == null)
                continue;

            Map.Entry<String, TermWeight> entry = new Map.Entry<String, TermWeight>() {
                public String getKey() {
                    return term;
                }

                public TermWeight getValue() {
                    return termWeight;
                }

                public TermWeight setValue(TermWeight arg0) {
                    // this entry is read-only; setValue is intentionally not supported
                    return null;
                }
            };
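            // keep only the top `size` entries; insertWithOverflow evicts the lowest-weight entry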
            queue.insertWithOverflow(entry);
        }
    }

    public TermWeightQueue getResultQueue() {
        return queue;
    }

    /**
     * Weight of a term in a Lucene document.
     * 
     * @since 0.2
     *
     */
    public static interface TermWeight {
        public float weight();
    }

    /**
     * {@link TermWeight} that computes a tf*idf weight for a term in a Lucene document.
     * 
     * @since 0.2
     *
     */
    protected static class DefaultTfIdfTermWeight implements TermWeight {

        private final int maxDoc, tf, docFreq;

        public DefaultTfIdfTermWeight(int maxDoc, int tf, int docFreq) {
            this.maxDoc = maxDoc;
            this.tf = tf;
            this.docFreq = docFreq;
        }

        public float weight() {
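            // simple tf*idf-style weight: sqrt(tf) * (1 + log(maxDoc / (docFreq + 1)))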
            return (float) (Math.sqrt(tf) * (1 + Math.log(maxDoc / (docFreq + 1))));
        }

        public int maxDoc() {
            return maxDoc;
        }

        public int tf() {
            return tf;
        }

        public int docFreq() {
            return docFreq;
        }
    }

    /**
     * Factory that creates {@link TermWeight} instances.
     * 
     * @since 0.2
     */
    public static interface TermWeightFactory {
        public TermWeight create(BytesRef term) throws IOException;
    }

    /**
     * {@link TermWeightFactory} that creates {@link DefaultTfIdfTermWeight} instances as the {@link TermWeight}.
     * 
     * @since 0.2
     *
     */
    public static class DefaultTfIdfTermWeightFactory implements TermWeightFactory {

        private final IndexReader reader;
        private final IndexSearcher searcher;
        private final int maxDoc;
        private final int docId;
        private final String fieldName;
        private final Bits liveDocs;

        public DefaultTfIdfTermWeightFactory(IndexReader reader, int docId, String fieldName, Bits liveDocs) {
            this.reader = reader;
            searcher = new IndexSearcher(reader);
            maxDoc = reader.maxDoc();
            this.docId = docId;
            this.fieldName = fieldName;
            this.liveDocs = liveDocs == null ? MultiFields.getLiveDocs(reader) : liveDocs;
        }

        public TermWeight create(BytesRef term) throws IOException {
            PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader, fieldName, term);
            int d = docsEnum.advance(docId);
            if (d != docId) {
                throw new RuntimeException("wrong docId!");
            }
            final int tf = docsEnum.freq();
            final int docFreq = docFreq(term);
            return new DefaultTfIdfTermWeight(maxDoc, tf, docFreq);
        }

        protected int docFreq(BytesRef text) throws IOException {
            return docFreq(text.utf8ToString());
        }

        protected int docFreq(String text) throws IOException {
            return LuceneUtil.getTermDocFreq(searcher, fieldName, text);
        }
    }

    final static class TermWeightQueue extends PriorityQueue<Map.Entry<String, TermWeight>> {

        public TermWeightQueue(int maxSize) {
            super(maxSize);
        }

        // collect terms with larger weight
        protected boolean lessThan(Entry<String, TermWeight> a, Entry<String, TermWeight> b) {
            return a.getValue().weight() < b.getValue().weight();
        }
    }
}
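
Usage

The sketch below is not part of the original source; it shows one way the class above might be used to print the top weighted terms of a document. The index path and the field name "body" are placeholder assumptions, the field must have been indexed with term vectors enabled, and the example lives in the same package because TermWeightQueue is package-private.

package org.nlp4l.lucene; // same package, since TermWeightQueue has package-private access

import java.nio.file.Paths;
import java.util.Map;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class LuceneDocTermVectorExample {

    public static void main(String[] args) throws Exception {
        // open an existing index; "/path/to/index" is a placeholder and must point at an
        // index whose "body" field was indexed with term vectors enabled
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // top 10 terms of document 0, weighted with the default tf*idf factory
            LuceneDocTermVector vector = new LuceneDocTermVector(reader, 0, "body", 10);

            // the priority queue pops entries in ascending weight order
            LuceneDocTermVector.TermWeightQueue queue = vector.getResultQueue();
            while (queue.size() > 0) {
                Map.Entry<String, LuceneDocTermVector.TermWeight> entry = queue.pop();
                System.out.println(entry.getKey() + "\t" + entry.getValue().weight());
            }
        }
    }
}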