Usage examples for org.apache.lucene.search.spell.NGramDistance#getDistance
@Override
public float getDistance(String source, String target)
From source file:br.bireme.ngrams.NGrams.java
private static void searchRaw(final Parameters parameters, final IndexSearcher searcher, final NGAnalyzer analyzer, final NGramDistance ngDistance, final String text, final boolean useSimilarity, final Set<String> id_id, final Set<Result> results) throws IOException, ParseException { assert parameters != null; assert searcher != null; assert analyzer != null; assert ngDistance != null; assert id_id != null; assert results != null; if (text == null) { throw new NullPointerException("text"); }/*w ww . j ava 2s . c o m*/ final String text2 = StringEscapeUtils.unescapeHtml4(text); final String[] param = text2.trim().split(" *\\| *", Integer.MAX_VALUE); if (param.length != parameters.nameFields.size()) { throw new IOException(text); } final String fname = parameters.indexed.name; final QueryParser parser = new QueryParser(fname, analyzer); final String ntext = Tools .limitSize(Tools.normalize(param[parameters.indexed.pos], OCC_SEPARATOR), MAX_NG_TEXT_SIZE).trim(); final int MAX_RESULTS = 20; if (!ntext.isEmpty()) { final Query query = parser.parse(QueryParser.escape(ntext)); final TopDocs top = searcher.search(query, MAX_RESULTS); final float lower = parameters.scores.first().minValue; ScoreDoc[] scores = top.scoreDocs; int remaining = MAX_RESULTS; for (ScoreDoc sdoc : scores) { if (remaining-- <= 0) { break; // Only for performance } final Document doc = searcher.doc(sdoc.doc); if (useSimilarity) { final String dname = doc.get(fname); if (dname == null) { throw new IOException("dname"); } final float similarity = ngDistance.getDistance(ntext, doc.get(fname)); if (similarity < lower) { if (remaining > 3) { remaining = 3; //System.out.println("Atualizando tot=" + tot + " score=" + sdoc.score + " similarity=" + similarity+ " text=" + doc.get(fname)); } } else { final Result out = createResult(id_id, parameters, param, doc, ngDistance, similarity, sdoc.score); if (out != null) { results.add(out); } } } else { if (sdoc.score < 1.0) { System.out.println("Saindo score=" + 
sdoc.score); break; // Only for performance } final Result out = createResult(id_id, parameters, param, doc, ngDistance, 0, sdoc.score); if (out != null) { results.add(out); } } } } }
From source file:br.bireme.ngrams.NGrams.java
/**
 * Compares a candidate value with the value stored in {@code doc} for an
 * n-gram field.
 *
 * <p>Both values are trimmed, with {@code null} treated as the empty string.
 *
 * @param ngDistance n-gram measure used for the comparison
 * @param field      field description; cast to {@code NGramField} to read its
 *                   {@code minScore}
 * @param fld        candidate value, may be null
 * @param doc        document holding the stored value under {@code field.name}
 * @return {@code 1} if the similarity reaches the field's {@code minScore};
 *         {@code -2} if it does not and {@code field.contentMatch} is
 *         {@code Status.MAX_SCORE}; {@code -1} otherwise, including when both
 *         values are empty
 */
private static int compareNGramFields(final NGramDistance ngDistance,
                                      final br.bireme.ngrams.Field field,
                                      final String fld,
                                      final Document doc) {
    assert ngDistance != null;
    assert field != null;
    assert doc != null;

    final String stored = doc.get(field.name);

    String candidate = "";
    if (fld != null) {
        candidate = fld.trim();
    }
    String reference = "";
    if (stored != null) {
        reference = stored.trim();
    }

    // Nothing on either side: there is nothing to measure.
    if (candidate.isEmpty() && reference.isEmpty()) {
        return -1;
    }

    final float similarity = ngDistance.getDistance(candidate, reference);
    if (similarity >= ((NGramField) field).minScore) {
        return 1;
    }
    return (field.contentMatch == Status.MAX_SCORE) ? -2 : -1;
}
From source file:ca.ualberta.entitylinking.common.indexing.AliasLuceneIndex.java
License:Open Source License
private List<Integer> rankingByNGramDistance(TopDocs td, String str) { NGramDistance measure = new NGramDistance(2); List<Rank<Double, Integer>> rankList = new ArrayList<Rank<Double, Integer>>(); try {//from w w w . j a v a 2s. c om if (keyArray == null) keyArray = FieldCache.DEFAULT.getStrings(reader, "docID"); } catch (Exception e) { e.printStackTrace(); } for (int i = 0; i < td.scoreDocs.length; i++) { int docId = td.scoreDocs[i].doc; String alias = keyArray[docId]; double sim = measure.getDistance(alias, str); rankList.add(new Rank<Double, Integer>(sim, docId)); } Collections.sort(rankList); List<Integer> ret = new ArrayList<Integer>(); for (Rank<Double, Integer> rank : rankList) { ret.add(rank.obj); } return ret; }
From source file:ca.ualberta.entitylinking.utils.similarity.StringSim.java
License:Open Source License
/**
 * Scores how alike two strings are with an n-gram measure built for the
 * requested gram size.
 *
 * @param s source string
 * @param t target string
 * @param n number of characters in each gram
 * @return a similarity between 0 and 1
 */
public static double ngram_distance(String s, String t, int n) {
    // getDistance yields a float; it is widened to double on return.
    final float similarity = new NGramDistance(n).getDistance(s, t);
    return similarity;
}