List of usage examples for org.apache.lucene.search.highlight TokenSources getAnyTokenStream
@Deprecated public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer) throws IOException
From source file:ch.admin.isb.hermes5.business.search.HighlighterWrapper.java
License:Apache License
public String getHighlightedText(int id, String contentField) throws IOException { try {//from ww w .j a va 2 s. com Document hitDoc = isearcher.doc(id); String text = hitDoc.get(contentField); TokenStream tokenStream = TokenSources.getAnyTokenStream(isearcher.getIndexReader(), id, contentField, analyzer); String fragment = highlighter.getBestFragments(tokenStream, text, numberOfFragments, "..."); return trimFragment(fragment); } catch (InvalidTokenOffsetsException e) { throw new RuntimeException(e); } }
From source file:ci6226.eval_index_reader.java
public static void Searchit(IndexReader reader, IndexSearcher searcher, Analyzer _analyzer, String field, String[] _searchList, int _topn, PrintWriter writer) throws org.apache.lucene.queryparser.classic.ParseException, IOException, InvalidTokenOffsetsException { Analyzer analyzer = _analyzer;// w ww . ja v a 2 s . c om QueryParser parser = new QueryParser(Version.LUCENE_47, field, analyzer); String[] testString = _searchList;//{"to","123","impressed","Geezer","geezer","semi-busy","\"eggs vegetable\"","gs veget","\"gs veget\""};//,"good","I","but","coffee"}; for (int j = 0; j < testString.length; j++) { String lstr = String.valueOf(j) + "," + testString[j]; Query query = parser.parse(testString[j]); System.out.println("Searching for: " + query.toString(field)); TopDocs topdocs = searcher.search(query, _topn); lstr += "," + topdocs.totalHits; ScoreDoc[] scoreDocs = topdocs.scoreDocs; SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query.rewrite(reader))); for (int i = 0; i < scoreDocs.length; i++) { int doc = scoreDocs[i].doc; Document document = searcher.doc(doc); // System.out.println("Snippet=" + document.get(field)); System.out.println(i); String text = document.get(field); TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), doc, field, analyzer); TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "..."); String line = ""; for (int m = 0; m < frag.length; m++) { if ((frag[m] != null) && (frag[m].getScore() > 0)) { System.out.println((frag[m].toString())); line = frag[m].toString(); line = line.replaceAll("\n", ""); line = line.replaceAll("\r", ""); line = line.replaceAll("\"", ""); line = line.replaceAll(",", " "); } } lstr += "," + line; lstr += "," + String.valueOf(scoreDocs[i].score); } writer.write(lstr + "\n"); System.out.println("Search for:" + 
testString[j] + " Total hits=" + scoreDocs.length); System.out.println("////////////////////////////////////////////////////"); } }
From source file:com.main.Searcher.java
public List<Bean> searching(String s1, String s2, String radioBtn) throws IOException, ParseException, InvalidTokenOffsetsException { //getting reference of directory Directory dir = FSDirectory.open(Paths.get(Index_Dir)); //Index reader - an interface for accessing a point-in-time view of a lucene index IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = new IndexSearcher(reader); //analyzer with the default stop words, takes out the stop words Analyzer analyzer = new StandardAnalyzer(); String contents = "contents"; QueryParser parser = new QueryParser(contents, analyzer); int numOfDoc = reader.numDocs(); for (int i = 0; i < numOfDoc; i++) { Document d = reader.document(i); }//from w ww .j ava 2 s. c o m Query q1 = parser.parse(s1); Query q2 = parser.parse(s2); //conjuction, disjunction and negation BooleanQuery.Builder bq = new BooleanQuery.Builder(); //occur.must : both queries required in a doc if (radioBtn.equals("conjunction")) { bq.add(q1, BooleanClause.Occur.MUST); bq.add(q2, BooleanClause.Occur.MUST); bq.build(); } //occur.should: one of the q1 should be presen t in doc else if (radioBtn.equals("disjunction")) { bq.add(q1, BooleanClause.Occur.SHOULD); bq.add(q2, BooleanClause.Occur.SHOULD); bq.build(); } //negation: first should present , second should not else { bq.add(q1, BooleanClause.Occur.MUST); bq.add(q2, BooleanClause.Occur.MUST_NOT); bq.build(); } TopDocs hits = searcher.search(bq.build(), 10); Formatter formatter = new SimpleHTMLFormatter(); QueryScorer scorer = new QueryScorer(bq.build()); //used to markup highlighted terms found in the best sections of a cont Highlighter highlighter = new Highlighter(formatter, scorer); //It breaks cont up into same-size texts but does not split up spans Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10); //breaks cont up into same-size fragments with no concerns over spotting sentence boundaries. 
//set fragmenter to highlighter highlighter.setTextFragmenter(fragmenter); for (int i = 0; i < hits.scoreDocs.length; i++) { Bean bean = new Bean(); int outResult = hits.scoreDocs.length; bean.setNumFile(outResult); int docid = hits.scoreDocs[i].doc; double rank = hits.scoreDocs[i].score; bean.setRankSc(rank); Document doc = searcher.doc(docid); String name = doc.get("name"); String title = doc.get("title"); bean.setTitle(name); String path = doc.get("path"); bean.setPath(path); String cont = doc.get("contents"); //Create token stream TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer); //Get highlighted cont fragments String[] frags = highlighter.getBestFragments(stream, cont, 10); ArrayList<String> dummy = new ArrayList<>(); for (String frag : frags) { dummy.add(frag); } bean.setContent(dummy); beanList.add(bean); } dir.close(); // } return beanList; }
From source file:com.main.Searcher.java
public List<Bean> searching(String s1) throws IOException, ParseException, InvalidTokenOffsetsException { //Get directory reference Directory dir = FSDirectory.open(Paths.get(Index_Dir)); //Index reader - an interface for accessing a point-in-time view of a lucene index IndexReader reader = DirectoryReader.open(dir); //CreateIndexReader reader = DirectoryReader.open(dir); lucene searcher. It search over a single IndexReader. IndexSearcher searcher = new IndexSearcher(reader); //analyzer with the default stop words Analyzer analyzer = new StandardAnalyzer(); //Query parser to be used for creating TermQuery String queries = null;//from w w w.ja v a2 s. c o m String queryString = null; //regular search String contents = "contents"; BufferedReader in = null; if (queries != null) { in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8); } else { in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8)); } QueryParser parser = new QueryParser(contents, analyzer); int numOfDoc = reader.numDocs(); for (int i = 0; i < numOfDoc; i++) { Document d = reader.document(i); } Query q1 = parser.parse(s1); BooleanQuery.Builder bq = new BooleanQuery.Builder(); bq.add(q1, BooleanClause.Occur.MUST); //Search the lucene documents TopDocs hits = searcher.search(bq.build(), 10); // TopScoreDocCollector collector = TopScoreDocCollector.create(5); /** * Highlighter Code Start *** */ //Uses HTML <B></B> tag to highlight the searched terms Formatter formatter = new SimpleHTMLFormatter(); //It scores cont fragments by the number of unique q1 terms found //Basically the matching score in layman terms QueryScorer scorer = new QueryScorer(bq.build()); //used to markup highlighted terms found in the best sections of a cont Highlighter highlighter = new Highlighter(formatter, scorer); //It breaks cont up into same-size texts but does not split up spans Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10); //breaks cont up into same-size fragments 
with no concerns over spotting sentence boundaries. //set fragmenter to highlighter highlighter.setTextFragmenter(fragmenter); //Iterate over found results for (int i = 0; i < hits.scoreDocs.length; i++) { Bean bean = new Bean(); //int rank = hits.scoreDocs.length; int outResult = hits.scoreDocs.length; bean.setNumFile(outResult); int docid = hits.scoreDocs[i].doc; double rank = hits.scoreDocs[i].score; bean.setRankSc(rank); Document doc = searcher.doc(docid); // String title = doc.get("title"); String name = doc.get("name"); String title = doc.get("title"); bean.setTitle(name); String path = doc.get("path"); bean.setPath(path); String cont = doc.get("contents"); //Create token stream TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer); //Get highlighted cont fragments String[] frags = highlighter.getBestFragments(stream, cont, 10); ArrayList<String> dummy = new ArrayList<>(); for (String frag : frags) { dummy.add(frag); } bean.setContent(dummy); beanList.add(bean); } dir.close(); // } return beanList; }
From source file:Example.lucene.SearchNHilight.java
public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException { //... Above, create documents with two fields, one with term vectors (tv) and one without (notv) Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45); Directory index = FSDirectory.open(new File("data/indexing")); String querystr = args.length > 0 ? args[0] : "golf user"; // the "title" arg specifies the default field to use // when no field is explicitly specified in the query. Query query = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer) .parse(querystr);/* ww w. j a v a 2s. c o m*/ // 3. search int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); TopDocs hits = searcher.search(query, 10); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); String Preview; for (int i = 0; i < 10; i++) { int id = hits.scoreDocs[i].doc; Document doc = searcher.doc(id); String text; Preview = ""; System.out.println(doc.get("url")); System.out.println(doc.get("title")); text = doc.get("content"); TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content", analyzer); TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "..."); int k = 0; for (TextFragment frag1 : frag) { if ((frag1 != null) && (frag1.getScore() > 0)) { Preview += (frag1.toString()) + "...<br>"; k++; // Get 2 Line Preview if (k >= 2) break; } } //Term vector System.out.println("-------------"); } }
From source file:it.cnr.ilc.lc.clavius.search.Tester.java
private static void searchWithHighlighter(String term) throws IOException, ParseException, InvalidTokenOffsetsException { logger.info("searchWithContext2 (" + term + ")"); Directory indexDirectory = FSDirectory .open(Paths.get("/var/lucene/claviusTest/indexes/it.cnr.ilc.lc.clavius.search.entity.PlainText")); DirectoryReader ireader = DirectoryReader.open(indexDirectory); IndexSearcher searcher = new IndexSearcher(ireader); QueryParser parser = new QueryParser("content", new StandardAnalyzer()); Query query = parser.parse(term); TopDocs hits = searcher.search(query, 10); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); //Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); ClaviusHighlighter highlighter = new ClaviusHighlighter(htmlFormatter, new QueryScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter(9)); for (int i = 0; i < hits.totalHits; i++) { int id = hits.scoreDocs[i].doc; Document doc = searcher.doc(id); String idDoc = doc.get("idDoc"); String text = doc.get("content"); TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content", new StandardAnalyzer()); List<Annotation> frag = highlighter.getBestTextClaviusFragments(tokenStream, idDoc, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "..."); for (int j = 0; j < frag.size(); j++) { logger.info("idDoc: " + idDoc + ", Annotation[" + j + "] " + frag.get(j).toString()); }// w ww .j av a2s . c o m // TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "..."); // for (int j = 0; j < frag.length; j++) { // if ((frag[j] != null) && (frag[j].getScore() > 0)) { // logger.info("frag["+j+"] "+frag[j].toString()); // } // } // } }
From source file:it.cnr.ilc.lc.claviusweb.ClaviusSearch.java
private static List<Annotation> fullTextSearch(String term) throws IOException, ParseException, InvalidTokenOffsetsException { log.info("fullTextSearch (" + term + ")"); List<Annotation> result = new ArrayList<>(); try {// ww w. ja v a2s . co m Directory indexDirectory = FSDirectory .open(Paths.get("/var/lucene/clavius-1.0.5/indexes/it.cnr.ilc.lc.claviusweb.entity.PlainText")); DirectoryReader ireader = DirectoryReader.open(indexDirectory); IndexSearcher searcher = new IndexSearcher(ireader); Analyzer fullTextAnalyzer = CustomAnalyzer.builder() .addCharFilter("patternReplace", "pattern", "([\\-\\(\\)\\[\\],\\.;:])", "replacement", " $1 ") .withTokenizer("whitespace").build(); //QueryParser parserTerm = new QueryParser("content", fullTextAnalyzer); // AnalyzingQueryParser parser = new AnalyzingQueryParser("content", fullTextAnalyzer); // Query query2 = parser.parse(term); // Query query = new WildcardQuery(new Term("content", term)); TopDocs hits = searcher.search(query, MAX_SEARCH_HITS); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); //Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); ClaviusHighlighter highlighter = new ClaviusHighlighter(htmlFormatter, new QueryScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter()); log.info("hits.totalHits=(" + hits.totalHits + ")"); for (int i = 0; i < hits.totalHits; i++) { int id = hits.scoreDocs[i].doc; Document doc = searcher.doc(id); String idDoc = doc.get("idDoc"); //String text = doc.get("content"); TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content", fullTextAnalyzer); List<Annotation> frag = highlighter.getBestTextClaviusFragments(tokenStream, doc, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "..."); for (int j = 0; j < frag.size(); j++) { log.debug("idDoc: " + idDoc + ", Annotation[" + j + "] " + frag.get(j).toString()); } result.addAll(frag); } } catch (InvalidTokenOffsetsException | 
IOException e) { log.error(e); } log.info("Full Text Search found " + result.size() + " result(s) for term " + term); return result; }
From source file:Main.WebAPI.Search.java
/** * // w w w . j a v a 2 s . c om * @param args args[0] is a query * * @throws IOException * @throws ParseException * @throws InvalidTokenOffsetsException */ public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException { //... Above, create documents with two fields, one with term vectors (tv) and one without (notv) Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45); Directory index = FSDirectory.open(new File("data/indexing")); String querystr = args.length > 0 ? args[0] : "mike lab"; // the "title" arg specifies the default field to use // when no field is explicitly specified in the query. Query query = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer) .parse(querystr); // 3. search int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); TopDocs hits = searcher.search(query, 10); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); String Preview; for (int i = 0; i < 10; i++) { int id = hits.scoreDocs[i].doc; Document doc = searcher.doc(id); String text; Preview = ""; System.out.println(doc.get("url")); System.out.println(doc.get("title")); text = doc.get("content"); TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content", analyzer); TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "..."); int k = 0; for (TextFragment frag1 : frag) { if ((frag1 != null) && (frag1.getScore() > 0)) { Preview += (frag1.toString()) + "...<br>"; k++; // Get 2 Line Preview if (k >= 2) break; } } //Term vector System.out.println("-------------"); } }
From source file:org.apache.blur.utils.HighlightHelper.java
License:Apache License
/**
 * Builds a copy of {@code document} in which highlightable field values are replaced
 * by their highlighted fragments, wrapped in {@code preTag}/{@code postTag}.
 *
 * NOTE: This method will not preserve the correct field types — every highlighted
 * value is re-added as a stored {@code StringField}.
 *
 * @param docId         Lucene doc id used to fetch token streams for highlighting
 * @param document      the stored document whose fields are highlighted
 * @param query         the user query; rewritten per-field via fixSuperQuery
 * @param fieldManager  supplies the query analyzer and field-less-indexing metadata
 * @param reader        reader used to obtain token streams (term vectors if present)
 * @param preTag        text inserted before each highlighted term
 * @param postTag       text inserted after each highlighted term
 * @return a new Document mixing untouched fields and highlighted replacements
 */
public static Document highlight(int docId, Document document, Query query, FieldManager fieldManager,
        IndexReader reader, String preTag, String postTag) throws IOException, InvalidTokenOffsetsException {
    String fieldLessFieldName = fieldManager.getFieldLessFieldName();
    // Query adjusted for the field-less field; reused for fields not field-less indexed.
    Query fixedQuery = fixSuperQuery(query, null, fieldLessFieldName);
    Analyzer analyzer = fieldManager.getAnalyzerForQuery();
    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(preTag, postTag);
    Document result = new Document();
    for (IndexableField f : document) {
        String name = f.name();
        // Pass through the field-less field and any explicitly excluded fields unchanged.
        if (fieldLessFieldName.equals(name) || FIELDS_NOT_TO_HIGHLIGHT.contains(name)) {
            result.add(f);
            continue;
        }
        String text = f.stringValue();
        Number numericValue = f.numericValue();
        // Field-less-indexed fields get a query fixed up for this specific field name.
        Query fieldFixedQuery;
        if (fieldManager.isFieldLessIndexed(name)) {
            fieldFixedQuery = fixSuperQuery(query, name, fieldLessFieldName);
        } else {
            fieldFixedQuery = fixedQuery;
        }
        if (numericValue != null) {
            // Numeric fields cannot go through the text highlighter: wrap the whole
            // stringified value in the tags when the query matches the number.
            if (shouldNumberBeHighlighted(name, numericValue, fieldFixedQuery)) {
                String numberHighlight = preTag + text + postTag;
                result.add(new StringField(name, numberHighlight, Store.YES));
            }
        } else {
            // Text field: highlight the best fragments scored against the per-field query.
            Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(fieldFixedQuery, name));
            TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, docId, name, analyzer);
            TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
            // Only fragments that actually matched (score > 0) are kept.
            for (int j = 0; j < frag.length; j++) {
                if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                    result.add(new StringField(name, frag[j].toString(), Store.YES));
                }
            }
        }
    }
    return result;
}
From source file:perf.SearchTask.java
License:Apache License
/**
 * Highlights one hit and folds the resulting fragments into {@code totHiliteHash}
 * (presumably so the benchmark's highlighting work stays observable and cannot be
 * optimized away — TODO confirm against the surrounding perf harness).
 *
 * Uses the FastVectorHighlighter when the index state provides one; otherwise
 * falls back to the standard Highlighter over a token stream.
 *
 * @param docID      Lucene doc id of the hit to highlight
 * @param indexState carries the field name and optional fastHighlighter
 * @param searcher   searcher used to load the document and obtain token streams
 */
private void hilite(int docID, IndexState indexState, IndexSearcher searcher) throws IOException {
    if (indexState.fastHighlighter != null) {
        // Fast path: fragments come straight from indexed term vectors.
        for (String h : indexState.fastHighlighter.getBestFragments(fieldQuery, searcher.getIndexReader(), docID,
                indexState.textFieldName, 100, 2)) {
            totHiliteHash += h.hashCode();
        }
    } else {
        Document doc = searcher.doc(docID);
        String text = doc.get(indexState.textFieldName);
        // NOTE: passing null for analyzer: TermVectors must be indexed for this to work.
        TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), docID,
                indexState.textFieldName, null);
        TextFragment[] frags;
        try {
            frags = highlighter.getBestTextFragments(tokenStream, text, false, 2);
        } catch (InvalidTokenOffsetsException ioe) {
            // Offsets inconsistent with the stored text — treat as a fatal setup error.
            throw new RuntimeException(ioe);
        }
        // Accumulate only fragments that actually matched (score > 0).
        for (int j = 0; j < frags.length; j++) {
            if (frags[j] != null && frags[j].getScore() > 0) {
                totHiliteHash += frags[j].toString().hashCode();
            }
        }
    }
}