Example usage for the org.apache.lucene.search.spell.NGramDistance constructor NGramDistance(int)

List of usage examples for the org.apache.lucene.search.spell.NGramDistance constructor NGramDistance(int)

Introduction

On this page you can find example usages of the org.apache.lucene.search.spell.NGramDistance constructor NGramDistance(int).

Prototype

public NGramDistance(int size) 

Source Link

Document

Creates an N-Gram distance measure using n-grams of the specified size.

Usage

From source file:br.bireme.ngrams.NGrams.java

/**
 * Runs a search for every non-blank line of a pipe-delimited input file and
 * writes the matches to a pipe-delimited output file.
 *
 * <p>Every input line has its ':' characters replaced by spaces and is trimmed.
 * A non-empty line must split on '|' into exactly as many fields as
 * {@code schema.getParameters().nameFields} holds; otherwise the whole run is
 * aborted with an {@link IOException}. A progress marker is printed to standard
 * output every 250 input lines.
 *
 * <p>The index reader behind {@code index.getIndexSearcher()} is always closed
 * before this method returns, whether it completes normally or throws.
 *
 * @param index index to search
 * @param schema schema supplying the search parameters
 * @param inFile path of the input file
 * @param inFileEncoding charset name used to read {@code inFile}
 * @param outFile path of the output file (created or overwritten)
 * @param outFileEncoding charset name used to write {@code outFile}
 * @throws IOException if a file cannot be read or written, or an input line
 *         has an unexpected number of fields
 * @throws ParseException declared for the underlying search; presumably raised
 *         by {@code searchRaw} - confirm against its implementation
 * @throws NullPointerException if any argument is {@code null}
 */
public static void search(final NGIndex index, final NGSchema schema, final String inFile,
        final String inFileEncoding, final String outFile, final String outFileEncoding)
        throws IOException, ParseException {
    if (index == null) {
        throw new NullPointerException("index");
    }
    if (schema == null) {
        throw new NullPointerException("schema");
    }
    if (inFile == null) {
        throw new NullPointerException("inFile");
    }
    if (inFileEncoding == null) {
        throw new NullPointerException("inFileEncoding");
    }
    if (outFile == null) {
        throw new NullPointerException("outFile");
    }
    if (outFileEncoding == null) {
        throw new NullPointerException("outFileEncoding");
    }
    final Charset inCharset = Charset.forName(inFileEncoding);
    final Charset outCharset = Charset.forName(outFileEncoding);
    final IndexSearcher searcher = index.getIndexSearcher();
    try {
        final NGAnalyzer analyzer = (NGAnalyzer) index.getAnalyzer();
        final Parameters parameters = schema.getParameters();
        final NGramDistance ngDistance = new NGramDistance(analyzer.getNgramSize());
        // Shared across all input lines and handed to every searchRaw call.
        final Set<String> id_id = new HashSet<>();
        int cur = 0;
        try (final BufferedReader reader = Files.newBufferedReader(new File(inFile).toPath(), inCharset);
                final BufferedWriter writer = Files.newBufferedWriter(new File(outFile).toPath(), outCharset)) {
            writer.append("rank|similarity|search_doc_id|index_doc_id|"
                    + "ngram_search_text|ngram_index_text|search_source|" + "index_source\n");

            // Reused for each line; cleared before every search.
            final Set<Result> results = new HashSet<>();
            String line;
            while ((line = reader.readLine()) != null) {
                if (++cur % 250 == 0) {
                    System.out.println("<<< " + cur);
                }

                results.clear();
                final String tline = line.replace(':', ' ').trim();
                if (tline.isEmpty()) {
                    continue;
                }
                // Limit of Integer.MAX_VALUE keeps trailing empty fields in the count.
                final String[] split = tline.split(" *\\| *", Integer.MAX_VALUE);
                if (split.length != parameters.nameFields.size()) {
                    throw new IOException("invalid number of fields: " + line);
                }
                searchRaw(parameters, searcher, analyzer, ngDistance, tline, true, id_id, results);
                if (!results.isEmpty()) {
                    writeOutput(parameters, results, writer);
                }
            }
        }
    } finally {
        // Close the reader on failure too; previously it leaked whenever an exception escaped.
        searcher.getIndexReader().close();
    }
}

From source file:br.bireme.ngrams.NGrams.java

/**
 * Searches the index for entries matching a single pipe-delimited record.
 *
 * <p>The text has its ':' characters replaced by spaces and is trimmed. It must
 * then split on '|' into exactly as many fields as
 * {@code schema.getParameters().nameFields} holds.
 *
 * <p>The index reader behind {@code index.getIndexSearcher()} is always closed
 * before this method returns, whether it completes normally or throws.
 *
 * @param index index to search
 * @param schema schema supplying the search parameters
 * @param text pipe-delimited record to search for
 * @param original if {@code true} the results are rendered with
 *        {@code results2pipeFull}, otherwise with {@code results2pipe}
 * @return the rendered results
 * @throws IOException if the index cannot be accessed or {@code text} has an
 *         unexpected number of fields
 * @throws ParseException declared for the underlying search; presumably raised
 *         by {@code searchRaw} - confirm against its implementation
 * @throws NullPointerException if {@code index}, {@code schema} or
 *         {@code text} is {@code null}
 */
public static Set<String> search(final NGIndex index, final NGSchema schema, final String text,
        final boolean original) throws IOException, ParseException {
    if (index == null) {
        throw new NullPointerException("index");
    }
    if (schema == null) {
        throw new NullPointerException("schema");
    }
    if (text == null) {
        throw new NullPointerException("text");
    }
    final IndexSearcher searcher = index.getIndexSearcher();
    final Parameters parameters;
    final Set<Result> results = new HashSet<>();
    try {
        final NGAnalyzer analyzer = (NGAnalyzer) index.getAnalyzer();
        parameters = schema.getParameters();
        final NGramDistance ngDistance = new NGramDistance(analyzer.getNgramSize());
        final Set<String> id_id = new HashSet<>();

        final String ttext = text.replace(':', ' ').trim();
        // Limit of Integer.MAX_VALUE keeps trailing empty fields in the count.
        final String[] split = ttext.split(" *\\| *", Integer.MAX_VALUE);
        if (split.length != parameters.nameFields.size()) {
            throw new IOException("invalid number of fields: " + text);
        }

        searchRaw(parameters, searcher, analyzer, ngDistance, ttext, true, id_id, results);
    } finally {
        // Close the reader on failure too; previously it leaked whenever an exception escaped.
        searcher.getIndexReader().close();
    }

    return original ? results2pipeFull(parameters, results) : results2pipe(parameters, results);
}

From source file:br.bireme.ngrams.NGrams.java

/**
 * Searches the index for entries matching a single pipe-delimited record,
 * calling {@code searchRaw} with its boolean flag set to {@code false}
 * (judging by the method name, this skips the similarity step - confirm
 * against {@code searchRaw}).
 *
 * <p>The text has its ':' characters replaced by spaces and is trimmed. It must
 * then split on '|' into exactly as many fields as
 * {@code schema.getParameters().nameFields} holds.
 *
 * <p>The index reader behind {@code index.getIndexSearcher()} is always closed
 * before this method returns, whether it completes normally or throws.
 *
 * @param index index to search
 * @param schema schema supplying the search parameters
 * @param text pipe-delimited record to search for
 * @param original if {@code true} the results are rendered with
 *        {@code results2pipeFull}, otherwise with {@code results2pipe}
 * @return the rendered results
 * @throws IOException if the index cannot be accessed or {@code text} has an
 *         unexpected number of fields
 * @throws ParseException declared for the underlying search; presumably raised
 *         by {@code searchRaw} - confirm against its implementation
 * @throws NullPointerException if {@code index}, {@code schema} or
 *         {@code text} is {@code null}
 */
public static Set<String> srcWithoutSimil(final NGIndex index, final NGSchema schema, final String text,
        final boolean original) throws IOException, ParseException {
    if (index == null) {
        throw new NullPointerException("index");
    }
    if (schema == null) {
        throw new NullPointerException("schema");
    }
    if (text == null) {
        throw new NullPointerException("text");
    }
    final IndexSearcher searcher = index.getIndexSearcher();
    final Parameters parameters;
    final Set<Result> results = new HashSet<>();
    try {
        final NGAnalyzer analyzer = (NGAnalyzer) index.getAnalyzer();
        parameters = schema.getParameters();
        final NGramDistance ngDistance = new NGramDistance(analyzer.getNgramSize());
        final Set<String> id_id = new HashSet<>();

        final String ttext = text.replace(':', ' ').trim();
        // Limit of Integer.MAX_VALUE keeps trailing empty fields in the count.
        final String[] split = ttext.split(" *\\| *", Integer.MAX_VALUE);
        if (split.length != parameters.nameFields.size()) {
            throw new IOException("invalid number of fields: " + text);
        }

        searchRaw(parameters, searcher, analyzer, ngDistance, ttext, false, id_id, results);
    } finally {
        // Close the reader on failure too; previously it leaked whenever an exception escaped.
        searcher.getIndexReader().close();
    }

    return original ? results2pipeFull(parameters, results) : results2pipe(parameters, results);
}

From source file:br.bireme.ngrams.NGrams.java

/**
 * Searches the index for entries matching the given text and renders the
 * results with {@code results2json}.
 *
 * <p>The text has its ':' characters replaced by spaces and is trimmed before
 * being searched; no field-count validation is performed here. Results are
 * collected in a {@link TreeSet} and passed to {@code results2json} in
 * descending order of {@code Result}'s natural ordering.
 *
 * <p>The index reader behind {@code index.getIndexSearcher()} is always closed
 * before this method returns, whether it completes normally or throws.
 *
 * @param index index to search
 * @param schema schema supplying the search parameters
 * @param text record to search for
 * @return the results as rendered by {@code results2json}
 * @throws IOException if the index cannot be accessed
 * @throws ParseException declared for the underlying search; presumably raised
 *         by {@code searchRaw} - confirm against its implementation
 * @throws NullPointerException if any argument is {@code null}
 */
public static Set<String> searchJson(final NGIndex index, final NGSchema schema, final String text)
        throws IOException, ParseException {
    if (index == null) {
        throw new NullPointerException("index");
    }
    if (schema == null) {
        throw new NullPointerException("schema");
    }
    if (text == null) {
        throw new NullPointerException("text");
    }
    final IndexSearcher searcher = index.getIndexSearcher();
    final Parameters parameters;
    final TreeSet<Result> results = new TreeSet<>();
    try {
        final NGAnalyzer analyzer = (NGAnalyzer) index.getAnalyzer();
        parameters = schema.getParameters();
        final NGramDistance ngDistance = new NGramDistance(analyzer.getNgramSize());
        final Set<String> id_id = new HashSet<>();
        final String ttext = text.replace(':', ' ').trim();

        searchRaw(parameters, searcher, analyzer, ngDistance, ttext, true, id_id, results);
    } finally {
        // Close the reader on failure too; previously it leaked whenever an exception escaped.
        searcher.getIndexReader().close();
    }

    return results2json(parameters, results.descendingSet());
}

From source file:br.bireme.ngrams.Tools.java

/**
 * Returns the value computed by a size-3 {@code NGramDistance} for the two
 * given strings.
 *
 * @param str1 first string, passed as the first argument of {@code getDistance}
 * @param str2 second string, passed as the second argument of {@code getDistance}
 * @return the result of {@code getDistance(str1, str2)}
 */
public static float NGDistance(final String str1, final String str2) {
    final NGramDistance trigramMeasure = new NGramDistance(3);
    return trigramMeasure.getDistance(str1, str2);
}

From source file:ca.ualberta.entitylinking.common.indexing.AliasLuceneIndex.java

License:Open Source License

/**
 * Re-orders the hits in {@code td} by the size-2 n-gram distance between each
 * hit's cached "docID" field value and {@code str}.
 *
 * <p>The "docID" values are loaded lazily into {@code keyArray} through the
 * field cache. If that load fails, the failure is reported and the hits are
 * returned in their incoming order instead of failing with a
 * {@link NullPointerException} on the missing cache.
 *
 * @param td hits to re-rank
 * @param str string each hit's "docID" value is compared against
 * @return the document ids of {@code td}, ordered by {@code Rank}'s natural
 *         ordering of the computed values (or in the original order when the
 *         field cache is unavailable)
 */
private List<Integer> rankingByNGramDistance(TopDocs td, String str) {
    try {
        if (keyArray == null) {
            keyArray = FieldCache.DEFAULT.getStrings(reader, "docID");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    List<Integer> ret = new ArrayList<Integer>();

    if (keyArray == null) {
        // The cache could not be loaded: keep the incoming order rather than
        // dereferencing a null array below.
        for (int i = 0; i < td.scoreDocs.length; i++) {
            ret.add(td.scoreDocs[i].doc);
        }
        return ret;
    }

    NGramDistance measure = new NGramDistance(2);
    List<Rank<Double, Integer>> rankList = new ArrayList<Rank<Double, Integer>>();

    for (int i = 0; i < td.scoreDocs.length; i++) {
        int docId = td.scoreDocs[i].doc;
        // NOTE(review): alias may be null for documents without a "docID" value - confirm.
        String alias = keyArray[docId];
        double sim = measure.getDistance(alias, str);
        rankList.add(new Rank<Double, Integer>(sim, docId));
    }

    Collections.sort(rankList);

    for (Rank<Double, Integer> rank : rankList) {
        ret.add(rank.obj);
    }

    return ret;
}

From source file:ca.ualberta.entitylinking.utils.similarity.StringSim.java

License:Open Source License

/**
 * Computes the n-gram distance between two strings.
 *
 * @param s source string
 * @param t target string
 * @param n number of characters in each gram
 * @return a similarity between 0 and 1
 */
public static double ngram_distance(String s, String t, int n) {
    return new NGramDistance(n).getDistance(s, t);
}