com.tamingtext.fuzzy.OverlapMeasures.java Source code


Introduction

Here is the source code for com.tamingtext.fuzzy.OverlapMeasures.java, a class from the Taming Text examples that implements two character-level overlap measures for fuzzy string matching: a Jaccard coefficient computed over the distinct characters of two strings, and a Lucene-based cosine-style ranking that indexes each candidate term as a document of single-character tokens.

Source

/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */

package com.tamingtext.fuzzy;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.PatternTokenizer;

import java.io.IOException;
import java.io.Reader;
import java.util.Collections;
import java.util.regex.Pattern;

public class OverlapMeasures {

    //<start id="jaccard_end"/>
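    /**
     * Computes the Jaccard coefficient of the two character arrays: the number
     * of distinct characters they share divided by the number of distinct
     * characters that appear in either of them.
     */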
    public float jaccard(char[] s, char[] t) {
        int intersection = 0;
        int union = s.length + t.length;
        boolean[] sdup = new boolean[s.length];
        union -= findDuplicates(s, sdup); //<co id="co_fuzzy_jaccard_dups1"/>
        boolean[] tdup = new boolean[t.length];
        union -= findDuplicates(t, tdup);
        for (int si = 0; si < s.length; si++) {
            if (!sdup[si]) { //<co id="co_fuzzy_jaccard_skip1"/>
                for (int ti = 0; ti < t.length; ti++) {
                    if (!tdup[ti]) {
                        if (s[si] == t[ti]) { //<co id="co_fuzzy_jaccard_intersection" />
                            intersection++;
                            break;
                        }
                    }
                }
            }
        }
        union -= intersection;
        return (float) intersection / union; //<co id="co_fuzzy_jaccard_return"/>
    }

    /**
     * Flags every repeated occurrence of a character in {@code sdup} (each
     * occurrence after the first) and returns the number of duplicates found.
     */
    private int findDuplicates(char[] s, boolean[] sdup) {
        int ndup = 0;
        for (int si = 0; si < s.length; si++) {
            if (sdup[si]) {
                ndup++;
            } else {
                for (int si2 = si + 1; si2 < s.length; si2++) {
                    if (!sdup[si2]) {
                        sdup[si2] = s[si] == s[si2];
                    }
                }
            }
        }
        return ndup;
    }
    /*
    <calloutlist>
    <callout arearefs="co_fuzzy_jaccard_dups1"><para>Find duplicates and subtract from union.</para></callout>
    <callout arearefs="co_fuzzy_jaccard_skip1"><para>Skip duplicates.</para></callout>
    <callout arearefs="co_fuzzy_jaccard_intersection"><para>Find intersection.</para></callout>
    <callout arearefs="co_fuzzy_jaccard_return"><para>Return the Jaccard coefficient (similarity).</para></callout>
    </calloutlist>
     */
    //<end id="jaccard_end"/>

    /**
     * Ranks the candidate terms against the query term using Lucene: each term
     * is indexed as a document of single-character tokens in an in-memory index,
     * the query term is analyzed the same way, and the top n hits are returned
     * under Lucene's default (TF-IDF, cosine-style) scoring. The MatchAllDocsQuery
     * loop below just prints the indexed terms for inspection.
     */
    public TopDocs cosine(String queryTerm, int n, String... terms) throws IOException, ParseException {
        Directory directory = new RAMDirectory();
        final Pattern pattern = Pattern.compile(".");
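        // Analyzer that emits one token per character ("." matches any single
        // character), so terms are compared as bags of characters.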
        Analyzer analyzer = new Analyzer() {
            @Override
            public TokenStream tokenStream(String fieldName, Reader reader) {
                TokenStream result = null;
                try {
                    result = new PatternTokenizer(reader, pattern, 0);
                } catch (IOException e) {
                    // Ignored: if the reader cannot be consumed, a null stream is returned.
                }
                return result;
            }
        };
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        IndexWriter writer = new IndexWriter(directory, conf);
        for (String term : terms) {
            Document doc = new Document();
            doc.add(new Field("chars", term, Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        }
        writer.close();
        IndexReader reader = IndexReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), terms.length);
        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
            System.out.println("Id: " + topDocs.scoreDocs[i].doc + " Val: "
                    + searcher.doc(topDocs.scoreDocs[i].doc).get("chars"));
        }
        QueryParser qp = new QueryParser(Version.LUCENE_36, "chars", analyzer);
        Query query = qp.parse(queryTerm);
        return searcher.search(query, n);
    }
}
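
For reference, here is a minimal usage sketch. The OverlapMeasuresExample class, the example strings, and the printed output are illustrative assumptions only (they are not part of the original listing), and the sketch assumes the same Lucene 3.6 and Solr 3.x dependencies as the class above.

package com.tamingtext.fuzzy;

import org.apache.lucene.search.TopDocs;

public class OverlapMeasuresExample {
    public static void main(String[] args) throws Exception {
        OverlapMeasures measures = new OverlapMeasures();

        // Jaccard coefficient over the distinct characters of the two strings.
        float similarity = measures.jaccard("tamming".toCharArray(),
                                            "taming".toCharArray());
        System.out.println("Jaccard coefficient: " + similarity);

        // Cosine-style ranking: each candidate term is indexed as a document of
        // single-character tokens and scored against the query term.
        TopDocs docs = measures.cosine("tamming", 3, "taming", "teaming", "mating");
        if (docs.scoreDocs.length > 0) {
            System.out.println("Best match doc id: " + docs.scoreDocs[0].doc
                    + " score: " + docs.scoreDocs[0].score);
        }
    }
}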