Usage examples for the org.apache.lucene.search.spell.NGramDistance constructor NGramDistance(int)
public NGramDistance(int size)
From source file:br.bireme.ngrams.NGrams.java
/**
 * Runs one search per non-blank line of an input file and writes the matches
 * to an output file.
 *
 * <p>Every input line has its ':' characters replaced by spaces and is then
 * trimmed; lines that end up empty are skipped. The remaining text is split on
 * '|' (surrounding spaces ignored) and must yield exactly
 * {@code parameters.nameFields.size()} fields. The output file starts with a
 * pipe-separated header line; for each input line that produced results,
 * {@code writeOutput} is invoked with those results.
 *
 * <p>The index reader behind the searcher is closed when this method exits,
 * whether it completes normally or throws.
 *
 * @param index index that supplies the searcher and the n-gram analyzer
 * @param schema schema that supplies the search parameters
 * @param inFile path of the input file, one pipe-separated record per line
 * @param inFileEncoding charset name used to read {@code inFile}
 * @param outFile path of the output file
 * @param outFileEncoding charset name used to write {@code outFile}
 * @throws IOException if a file cannot be read or written, or if a line does
 *         not have the expected number of fields
 * @throws ParseException propagated from the search helpers this method calls
 * @throws NullPointerException if any argument is {@code null}
 */
public static void search(final NGIndex index, final NGSchema schema, final String inFile,
        final String inFileEncoding, final String outFile, final String outFileEncoding)
        throws IOException, ParseException {
    if (index == null) {
        throw new NullPointerException("index");
    }
    if (schema == null) {
        throw new NullPointerException("schema");
    }
    if (inFile == null) {
        throw new NullPointerException("inFile");
    }
    if (inFileEncoding == null) {
        throw new NullPointerException("inFileEncoding");
    }
    if (outFile == null) {
        throw new NullPointerException("outFile");
    }
    if (outFileEncoding == null) {
        throw new NullPointerException("outFileEncoding");
    }

    final Charset inCharset = Charset.forName(inFileEncoding);
    final Charset outCharset = Charset.forName(outFileEncoding);
    final IndexSearcher searcher = index.getIndexSearcher();

    // From here on the index reader must be released on every exit path,
    // including the "invalid number of fields" failure raised mid-file.
    try {
        final NGAnalyzer analyzer = (NGAnalyzer) index.getAnalyzer();
        final Parameters parameters = schema.getParameters();
        final NGramDistance ngDistance = new NGramDistance(analyzer.getNgramSize());
        final Set<String> id_id = new HashSet<>();
        final Set<Result> results = new HashSet<>();
        int cur = 0;

        try (final BufferedReader reader = Files.newBufferedReader(new File(inFile).toPath(), inCharset);
                final BufferedWriter writer = Files.newBufferedWriter(new File(outFile).toPath(), outCharset)) {
            writer.append("rank|similarity|search_doc_id|index_doc_id|"
                    + "ngram_search_text|ngram_index_text|search_source|" + "index_source\n");

            String line;
            while ((line = reader.readLine()) != null) {
                // Progress trace every 250 input lines (blank lines are counted too).
                if (++cur % 250 == 0) {
                    System.out.println("<<< " + cur);
                }
                results.clear();

                final String tline = line.replace(':', ' ').trim();
                if (tline.isEmpty()) {
                    continue;
                }

                // The limit keeps trailing empty fields so the count is exact.
                final String[] split = tline.split(" *\\| *", Integer.MAX_VALUE);
                if (split.length != parameters.nameFields.size()) {
                    throw new IOException("invalid number of fields: " + line);
                }

                searchRaw(parameters, searcher, analyzer, ngDistance, tline, true, id_id, results);
                if (!results.isEmpty()) {
                    writeOutput(parameters, results, writer);
                }
            }
        }
    } finally {
        searcher.getIndexReader().close();
    }
}
From source file:br.bireme.ngrams.NGrams.java
public static Set<String> search(final NGIndex index, final NGSchema schema, final String text, final boolean original) throws IOException, ParseException { if (index == null) { throw new NullPointerException("index"); }//w ww.j a va2s .c o m if (schema == null) { throw new NullPointerException("schema"); } if (text == null) { throw new NullPointerException("text"); } final IndexSearcher searcher = index.getIndexSearcher(); final NGAnalyzer analyzer = (NGAnalyzer) index.getAnalyzer(); final Parameters parameters = schema.getParameters(); final NGramDistance ngDistance = new NGramDistance(analyzer.getNgramSize()); final Set<String> id_id = new HashSet<>(); final Set<Result> results = new HashSet<>(); final String ttext = text.replace(':', ' ').trim(); final String[] split = ttext.split(" *\\| *", Integer.MAX_VALUE); if (split.length != parameters.nameFields.size()) { throw new IOException("invalid number of fields: " + text); } searchRaw(parameters, searcher, analyzer, ngDistance, ttext, true, id_id, results); searcher.getIndexReader().close(); return original ? results2pipeFull(parameters, results) : results2pipe(parameters, results); }
From source file:br.bireme.ngrams.NGrams.java
/**
 * Searches the index for a single pipe-separated record, passing
 * {@code false} as the boolean flag of {@code searchRaw}, and returns the
 * matches as pipe-formatted strings.
 *
 * <p>The ':' characters of {@code text} are replaced by spaces and the result
 * is trimmed before it is split on '|'; the number of fields must equal
 * {@code parameters.nameFields.size()}. The field count is checked before the
 * searcher is obtained, and the index reader behind the searcher is closed
 * even when the search itself fails.
 *
 * @param index index that supplies the searcher and the n-gram analyzer
 * @param schema schema that supplies the search parameters
 * @param text pipe-separated record to search for
 * @param original if {@code true} the results are formatted with
 *        {@code results2pipeFull}, otherwise with {@code results2pipe}
 * @return the formatted results
 * @throws IOException if {@code text} does not have the expected number of
 *         fields, or on an I/O failure in the underlying index
 * @throws ParseException propagated from the search helper
 * @throws NullPointerException if {@code index}, {@code schema} or
 *         {@code text} is {@code null}
 */
public static Set<String> srcWithoutSimil(final NGIndex index, final NGSchema schema, final String text,
        final boolean original) throws IOException, ParseException {
    if (index == null) {
        throw new NullPointerException("index");
    }
    if (schema == null) {
        throw new NullPointerException("schema");
    }
    if (text == null) {
        throw new NullPointerException("text");
    }

    final Parameters parameters = schema.getParameters();
    final String ttext = text.replace(':', ' ').trim();

    // Reject malformed input before any index resource is acquired.
    // The limit keeps trailing empty fields so the count is exact.
    final String[] split = ttext.split(" *\\| *", Integer.MAX_VALUE);
    if (split.length != parameters.nameFields.size()) {
        throw new IOException("invalid number of fields: " + text);
    }

    final Set<String> id_id = new HashSet<>();
    final Set<Result> results = new HashSet<>();
    final IndexSearcher searcher = index.getIndexSearcher();
    try {
        final NGAnalyzer analyzer = (NGAnalyzer) index.getAnalyzer();
        final NGramDistance ngDistance = new NGramDistance(analyzer.getNgramSize());
        // NOTE(review): the 'false' flag is the only difference from search(...);
        // the method name suggests it disables the similarity step — confirm in searchRaw.
        searchRaw(parameters, searcher, analyzer, ngDistance, ttext, false, id_id, results);
    } finally {
        // Release the reader on failure as well as on success.
        searcher.getIndexReader().close();
    }

    return original ? results2pipeFull(parameters, results) : results2pipe(parameters, results);
}
From source file:br.bireme.ngrams.NGrams.java
public static Set<String> searchJson(final NGIndex index, final NGSchema schema, final String text) throws IOException, ParseException { if (index == null) { throw new NullPointerException("index"); }//from w w w. j a v a2 s . c o m if (schema == null) { throw new NullPointerException("schema"); } if (text == null) { throw new NullPointerException("text"); } final IndexSearcher searcher = index.getIndexSearcher(); final NGAnalyzer analyzer = (NGAnalyzer) index.getAnalyzer(); final Parameters parameters = schema.getParameters(); final NGramDistance ngDistance = new NGramDistance(analyzer.getNgramSize()); final Set<String> id_id = new HashSet<>(); final TreeSet<Result> results = new TreeSet<>(); final String ttext = text.replace(':', ' ').trim(); searchRaw(parameters, searcher, analyzer, ngDistance, ttext, true, id_id, results); searcher.getIndexReader().close(); return results2json(parameters, results.descendingSet()); }
From source file:br.bireme.ngrams.Tools.java
/**
 * Scores two strings with a Lucene {@code NGramDistance} built for grams of
 * size 3.
 *
 * @param str1 first string
 * @param str2 second string
 * @return the value returned by {@code NGramDistance.getDistance(str1, str2)}
 */
public static float NGDistance(final String str1, final String str2) {
    final NGramDistance trigramMeasure = new NGramDistance(3);
    final float score = trigramMeasure.getDistance(str1, str2);
    return score;
}
From source file:ca.ualberta.entitylinking.common.indexing.AliasLuceneIndex.java
License:Open Source License
/**
 * Re-orders the hits of {@code td} by comparing, for each hit, the string
 * cached for its "docID" field against {@code str} with a size-2
 * {@code NGramDistance}.
 *
 * <p>The cached strings are loaded lazily into {@code keyArray} on first use.
 * If loading fails, the failure is reported and the hits are returned in
 * their incoming order instead of failing with a NullPointerException.
 *
 * @param td hits to re-order
 * @param str string each hit's cached value is compared with
 * @return the Lucene document ids of the hits, ordered by the natural
 *         ordering of {@code Rank} (defined outside this block), or in the
 *         incoming order when the cached strings are unavailable
 */
private List<Integer> rankingByNGramDistance(TopDocs td, String str) {
    try {
        if (keyArray == null) {
            keyArray = FieldCache.DEFAULT.getStrings(reader, "docID");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    final int hitCount = td.scoreDocs.length;
    List<Integer> ret = new ArrayList<Integer>(hitCount);

    if (keyArray == null) {
        // The field cache could not be loaded: keep the order the hits came in
        // rather than dereferencing a null array below.
        for (int i = 0; i < hitCount; i++) {
            ret.add(td.scoreDocs[i].doc);
        }
        return ret;
    }

    NGramDistance measure = new NGramDistance(2);
    List<Rank<Double, Integer>> rankList = new ArrayList<Rank<Double, Integer>>(hitCount);
    for (int i = 0; i < hitCount; i++) {
        int docId = td.scoreDocs[i].doc;
        // NOTE(review): a cache entry may be null for a document without the
        // field — confirm every indexed document carries "docID".
        String alias = keyArray[docId];
        double sim = measure.getDistance(alias, str);
        rankList.add(new Rank<Double, Integer>(sim, docId));
    }

    Collections.sort(rankList);
    for (Rank<Double, Integer> rank : rankList) {
        ret.add(rank.obj);
    }
    return ret;
}
From source file:ca.ualberta.entitylinking.utils.similarity.StringSim.java
License:Open Source License
/**
 * Scores two strings with a Lucene {@code NGramDistance} built for grams of
 * {@code n} characters.
 *
 * @param s source string
 * @param t target string
 * @param n number of characters in each gram
 * @return the value returned by {@code NGramDistance.getDistance(s, t)},
 *         described by the original author as a similarity between 0 and 1
 */
public static double ngram_distance(String s, String t, int n) {
    return new NGramDistance(n).getDistance(s, t);
}