Example usage for org.apache.lucene.search.highlight SimpleHTMLFormatter SimpleHTMLFormatter

List of usage examples for org.apache.lucene.search.highlight SimpleHTMLFormatter SimpleHTMLFormatter

Introduction

On this page you can find example usages of the no-argument constructor SimpleHTMLFormatter() of the class org.apache.lucene.search.highlight.SimpleHTMLFormatter.

Prototype

public SimpleHTMLFormatter() 

Source Link

Document

The default constructor uses HTML <B> tags to mark up terms.

Usage

From source file:ch.admin.isb.hermes5.business.search.HighlighterRepository.java

License:Apache License

/**
 * Builds a {@link HighlighterWrapper} for the given query.
 *
 * The wrapped Lucene highlighter is created with a default
 * {@link SimpleHTMLFormatter} and a {@link QueryScorer} built from
 * {@code query}. The configured number of fragments and the trim-string list
 * are handed to the wrapper together with the searcher and analyzer.
 *
 * @param analyzer  analyzer passed on to the wrapper
 * @param isearcher index searcher passed on to the wrapper
 * @param query     query whose terms are scored for highlighting
 * @return a new wrapper around the configured highlighter
 */
public HighlighterWrapper getHighlighter(Analyzer analyzer, IndexSearcher isearcher, Query query) {
    Highlighter luceneHighlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
    return new HighlighterWrapper(
            luceneHighlighter,
            numberOfFragments.getIntegerValue(),
            isearcher,
            analyzer,
            trimstringsList());
}

From source file:ci6226.eval_index_reader.java

/**
 * Runs every query string in {@code _searchList} against {@code field} and
 * writes one comma-separated line per query to {@code writer}.
 *
 * Each output line has the form
 * {@code index,queryText,totalHits[,snippet,score]...} where the
 * {@code snippet,score} pair is repeated for every returned hit. The snippet
 * is the last highlighted fragment with a positive score (empty if none),
 * with line breaks and double quotes removed and commas replaced by spaces.
 * Progress information is also printed to standard output.
 *
 * @param reader      reader used to rewrite the query for highlighting
 * @param searcher    searcher that executes the queries
 * @param _analyzer   analyzer used for query parsing and token streams
 * @param field       name of the field to search and highlight
 * @param _searchList query strings to run
 * @param _topn       maximum number of hits requested per query
 * @param writer      destination of the result lines
 */
public static void Searchit(IndexReader reader, IndexSearcher searcher, Analyzer _analyzer, String field,
        String[] _searchList, int _topn, PrintWriter writer)
        throws org.apache.lucene.queryparser.classic.ParseException, IOException, InvalidTokenOffsetsException {
    QueryParser parser = new QueryParser(Version.LUCENE_47, field, _analyzer);

    int queryIndex = 0;
    for (String queryText : _searchList) {
        StringBuilder row = new StringBuilder();
        row.append(queryIndex).append(',').append(queryText);

        Query query = parser.parse(queryText);
        System.out.println("Searching for: " + query.toString(field));

        TopDocs topdocs = searcher.search(query, _topn);
        row.append(',').append(topdocs.totalHits);

        ScoreDoc[] hits = topdocs.scoreDocs;
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(),
                new QueryScorer(query.rewrite(reader)));

        for (int rank = 0; rank < hits.length; rank++) {
            int docId = hits[rank].doc;
            Document document = searcher.doc(docId);
            System.out.println(rank);

            String text = document.get(field);
            TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), docId, field,
                    _analyzer);
            TextFragment[] fragments = highlighter.getBestTextFragments(tokenStream, text, false, 10);

            // Only the last fragment with a positive score survives the loop.
            String snippet = "";
            for (TextFragment fragment : fragments) {
                if (fragment == null || !(fragment.getScore() > 0)) {
                    continue;
                }
                String raw = fragment.toString();
                System.out.println(raw);
                snippet = raw.replace("\n", "")
                        .replace("\r", "")
                        .replace("\"", "")
                        .replace(",", " ");
            }

            row.append(',').append(snippet);
            row.append(',').append(hits[rank].score);
        }

        writer.write(row + "\n");
        System.out.println("Search for:" + queryText + " Total hits=" + hits.length);
        System.out.println("////////////////////////////////////////////////////");
        queryIndex++;
    }
}

From source file:com.bewsia.script.safe.lucene.SEntity.java

License:Open Source License

/**
 * Highlights the terms of {@code query} inside {@code text}.
 *
 * The text is tokenized with a {@link StandardAnalyzer}, split into fragments
 * of {@code fragmentSize} characters, and the best fragments are joined with
 * {@code separator}. Matching terms are marked up by a default
 * {@link SimpleHTMLFormatter}.
 *
 * @param query           query whose terms are highlighted
 * @param text            text to highlight
 * @param field           field name used when creating the token stream
 * @param fragmentSize    fragment size passed to {@link SimpleFragmenter}
 * @param maxNumFragments maximum number of fragments to return
 * @param separator       string placed between the returned fragments
 * @return the highlighted fragments, or {@code text} unchanged when the
 *         highlighter produced an empty result
 * @throws Exception if tokenizing or highlighting fails
 */
public String highlight(Query query, String text, String field, int fragmentSize, int maxNumFragments,
        String separator) throws Exception {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    try {
        CachingTokenFilter tokenStream = new CachingTokenFilter(
                analyzer.tokenStream(field, new StringReader(text)));
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
        Scorer scorer = new org.apache.lucene.search.highlight.QueryScorer(query);
        Highlighter highlighter = new Highlighter(formatter, scorer);
        highlighter.setTextFragmenter(new SimpleFragmenter(fragmentSize));
        tokenStream.reset();
        String rv = highlighter.getBestFragments(tokenStream, text, maxNumFragments, separator);
        return rv.length() == 0 ? text : rv;
    } finally {
        // The analyzer is created per call, so it must also be released per call.
        analyzer.close();
    }
}

From source file:com.isotrol.impe3.nr.core.NodeRepositoryImpl.java

License:Open Source License

/**
 * Convenience constructor that delegates to the three-argument constructor,
 * supplying a newly created default {@link SimpleHTMLFormatter}.
 *
 * @param queryable passed through unchanged to the three-argument constructor
 * @param analyzer  passed through unchanged to the three-argument constructor
 */
public NodeRepositoryImpl(final Queryable queryable, final Analyzer analyzer) {
    this(queryable, analyzer, new SimpleHTMLFormatter());
}

From source file:com.isotrol.impe3.oi.nr.OiNodeRepository.java

License:Open Source License

/**
 * Creates a new queryable from the store and fills in a default analyzer and
 * formatter when none have been set. Fails if no store is configured.
 *
 * @see org.springframework.beans.factory.InitializingBean#afterPropertiesSet()
 */
public void afterPropertiesSet() throws Exception {
    Preconditions.checkNotNull(store, "A valid store required!");
    queryable = Queryables.simple(store);

    // Keep whatever was injected; otherwise fall back to the defaults.
    analyzer = (analyzer != null) ? analyzer : new PortalStandardAnalyzer();
    formatter = (formatter != null) ? formatter : new SimpleHTMLFormatter();
    // translator = new QueryTranslator(analyzer);
    // BooleanQuery.setMaxClauseCount(maxClauseCount);
}

From source file:com.main.Searcher.java

/**
 * Searches the "contents" field of the index at {@code Index_Dir} for a
 * boolean combination of two query strings and appends one {@code Bean} per
 * hit (at most 10) to {@code beanList}.
 *
 * The combination is selected by {@code radioBtn}:
 * <ul>
 *   <li>{@code "conjunction"} - both queries must match;</li>
 *   <li>{@code "disjunction"} - either query may match;</li>
 *   <li>any other value - the first query must match and the second must not.</li>
 * </ul>
 * Each bean receives the number of returned hits, the hit score, the stored
 * "name" (as title) and "path" values, and the highlighted fragments of the
 * stored "contents" value.
 *
 * @param s1       first query string
 * @param s2       second query string
 * @param radioBtn combination mode, see above
 * @return {@code beanList} after the hits of this search have been added
 * @throws IOException                  if the index cannot be opened or read
 * @throws ParseException               if a query string cannot be parsed
 * @throws InvalidTokenOffsetsException if highlighting hits inconsistent token offsets
 */
public List<Bean> searching(String s1, String s2, String radioBtn)
        throws IOException, ParseException, InvalidTokenOffsetsException {
    String contents = "contents";

    // try-with-resources releases the directory, the reader and the analyzer
    // even when parsing, searching or highlighting throws.
    try (Directory dir = FSDirectory.open(Paths.get(Index_Dir));
            IndexReader reader = DirectoryReader.open(dir);
            Analyzer analyzer = new StandardAnalyzer()) {

        IndexSearcher searcher = new IndexSearcher(reader);
        QueryParser parser = new QueryParser(contents, analyzer);

        Query q1 = parser.parse(s1);
        Query q2 = parser.parse(s2);

        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        if (radioBtn.equals("conjunction")) {
            // both queries are required in a document
            bq.add(q1, BooleanClause.Occur.MUST);
            bq.add(q2, BooleanClause.Occur.MUST);
        } else if (radioBtn.equals("disjunction")) {
            // either query may be present in a document
            bq.add(q1, BooleanClause.Occur.SHOULD);
            bq.add(q2, BooleanClause.Occur.SHOULD);
        } else {
            // negation: the first must be present, the second must not
            bq.add(q1, BooleanClause.Occur.MUST);
            bq.add(q2, BooleanClause.Occur.MUST_NOT);
        }

        // Build once and use the same query for searching and for scoring.
        Query combined = bq.build();
        TopDocs hits = searcher.search(combined, 10);

        Formatter formatter = new SimpleHTMLFormatter();
        QueryScorer scorer = new QueryScorer(combined);
        Highlighter highlighter = new Highlighter(formatter, scorer);
        // Breaks the text into same-size fragments without splitting spans.
        Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);
        highlighter.setTextFragmenter(fragmenter);

        int outResult = hits.scoreDocs.length;
        for (int i = 0; i < hits.scoreDocs.length; i++) {
            Bean bean = new Bean();
            bean.setNumFile(outResult);

            int docid = hits.scoreDocs[i].doc;
            double rank = hits.scoreDocs[i].score;
            bean.setRankSc(rank);

            Document doc = searcher.doc(docid);
            bean.setTitle(doc.get("name"));
            bean.setPath(doc.get("path"));

            String cont = doc.get(contents);
            TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, contents, analyzer);
            String[] frags = highlighter.getBestFragments(stream, cont, 10);

            ArrayList<String> fragments = new ArrayList<>();
            for (String frag : frags) {
                fragments.add(frag);
            }
            bean.setContent(fragments);
            beanList.add(bean);
        }
    }
    return beanList;
}

From source file:com.main.Searcher.java

/**
 * Searches the "contents" field of the index at {@code Index_Dir} for a
 * single query string and appends one {@code Bean} per hit (at most 10) to
 * {@code beanList}.
 *
 * Each bean receives the number of returned hits, the hit score, the stored
 * "name" (as title) and "path" values, and the highlighted fragments of the
 * stored "contents" value.
 *
 * @param s1 query string
 * @return {@code beanList} after the hits of this search have been added
 * @throws IOException                  if the index cannot be opened or read
 * @throws ParseException               if the query string cannot be parsed
 * @throws InvalidTokenOffsetsException if highlighting hits inconsistent token offsets
 */
public List<Bean> searching(String s1) throws IOException, ParseException, InvalidTokenOffsetsException {
    String contents = "contents";

    // try-with-resources releases the directory, the reader and the analyzer
    // even when parsing, searching or highlighting throws.
    try (Directory dir = FSDirectory.open(Paths.get(Index_Dir));
            IndexReader reader = DirectoryReader.open(dir);
            Analyzer analyzer = new StandardAnalyzer()) {

        IndexSearcher searcher = new IndexSearcher(reader);
        QueryParser parser = new QueryParser(contents, analyzer);

        Query q1 = parser.parse(s1);
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        bq.add(q1, BooleanClause.Occur.MUST);

        // Build once and use the same query for searching and for scoring.
        Query required = bq.build();
        TopDocs hits = searcher.search(required, 10);

        // Formatter wraps the matched terms in HTML bold tags.
        Formatter formatter = new SimpleHTMLFormatter();
        QueryScorer scorer = new QueryScorer(required);
        Highlighter highlighter = new Highlighter(formatter, scorer);
        // Breaks the text into same-size fragments without splitting spans.
        Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);
        highlighter.setTextFragmenter(fragmenter);

        int outResult = hits.scoreDocs.length;
        for (int i = 0; i < hits.scoreDocs.length; i++) {
            Bean bean = new Bean();
            bean.setNumFile(outResult);

            int docid = hits.scoreDocs[i].doc;
            double rank = hits.scoreDocs[i].score;
            bean.setRankSc(rank);

            Document doc = searcher.doc(docid);
            bean.setTitle(doc.get("name"));
            bean.setPath(doc.get("path"));

            String cont = doc.get(contents);
            TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, contents, analyzer);
            String[] frags = highlighter.getBestFragments(stream, cont, 10);

            ArrayList<String> fragments = new ArrayList<>();
            for (String frag : frags) {
                fragments.add(frag);
            }
            bean.setContent(fragments);
            beanList.add(bean);
        }
    }
    return beanList;
}

From source file:com.meltmedia.cadmium.search.SearchService.java

License:Apache License

/**
 * Runs a full-text search and returns the outcome as an ordered map.
 *
 * The map always contains "number-hits" and "results". Each entry of
 * "results" is itself an ordered map with the keys "excerpt", "score",
 * "title" and "path". The excerpt is highlighted from the stored "content"
 * value; if that fails it is highlighted from the "title" value, and if that
 * fails too it is the empty string.
 *
 * @param query raw user query; characters matching ALLOWED_CHARS_PATTERN are
 *              backslash-escaped before parsing
 * @param path  optional path prefix; when not blank, only documents whose
 *              "path" starts with it are returned
 * @return the result map described above
 * @throws Exception if the search template fails
 */
private Map<String, Object> buildSearchResults(final String query, final String path) throws Exception {
    logger.info("Running search for [{}]", query);
    final Map<String, Object> resultMap = new LinkedHashMap<String, Object>();

    new SearchTemplate(provider) {
        public void doSearch(IndexSearcher index) throws IOException, ParseException {
            QueryParser parser = createParser(getAnalyzer());

            // Pre-populate so callers get a well-formed (empty) answer when nothing can be searched.
            List<Map<String, Object>> hitList = new ArrayList<Map<String, Object>>();
            resultMap.put("number-hits", 0);
            resultMap.put("results", hitList);

            if (index == null || parser == null) {
                return;
            }

            Query effectiveQuery = parser.parse(query.replaceAll(ALLOWED_CHARS_PATTERN, "\\\\$1"));
            if (StringUtils.isNotBlank(path)) {
                BooleanQuery restricted = new BooleanQuery();
                restricted.add(new PrefixQuery(new Term("path", path)), Occur.MUST);
                restricted.add(effectiveQuery, Occur.MUST);
                effectiveQuery = restricted;
            }

            TopDocs found = index.search(effectiveQuery, null, 100000);
            Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
                    new QueryScorer(effectiveQuery));

            logger.info("Search returned {} hits.", found.totalHits);
            resultMap.put("number-hits", found.totalHits);

            for (ScoreDoc hit : found.scoreDocs) {
                Document stored = index.doc(hit.doc);
                String content = stored.get("content");
                String title = stored.get("title");

                String excerpt;
                try {
                    excerpt = fixExcerpt(highlighter.getBestFragments(
                            parser.getAnalyzer().tokenStream(null, new StringReader(content)), content, 3,
                            "..."));
                } catch (Exception contentFailure) {
                    logger.debug("Failed to get search excerpt from content.", contentFailure);
                    try {
                        // Fall back to the title when the content cannot be highlighted.
                        excerpt = fixExcerpt(highlighter.getBestFragments(
                                parser.getAnalyzer().tokenStream(null, new StringReader(title)), title, 1,
                                "..."));
                    } catch (Exception titleFailure) {
                        logger.debug("Failed to get search excerpt from title.", titleFailure);
                        excerpt = "";
                    }
                }

                Map<String, Object> entry = new LinkedHashMap<String, Object>();
                entry.put("excerpt", excerpt);
                entry.put("score", hit.score);
                entry.put("title", title);
                entry.put("path", stored.get("path"));
                hitList.add(entry);
            }
        }
    }.search();

    return resultMap;
}

From source file:com.novartis.pcs.ontology.service.search.OntologySearchServiceImpl.java

License:Apache License

/**
 * Searches the term index for {@code pattern} and returns the matching terms
 * with HTML highlighting.
 *
 * The pattern is escaped first; if the escaped pattern is shorter than
 * {@code EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE} an empty list is
 * returned without searching. The search runs under the read lock. Only hits
 * whose best highlighted fragment has a positive score are returned.
 *
 * @param pattern         user-supplied search pattern
 * @param includeSynonyms whether hits flagged as synonyms are included
 * @return the highlighted results, possibly empty
 * @throws InvalidQuerySyntaxException if the pattern cannot be parsed
 * @throws RuntimeException            if a previous index refresh failed or
 *                                     any other error occurs while searching
 */
@Override
public List<HTMLSearchResult> search(String pattern, boolean includeSynonyms)
        throws InvalidQuerySyntaxException {
    Analyzer analyzer = null;

    // default QueryParser.escape(pattern) method does not support phrase queries
    pattern = QuerySyntaxUtil.escapeQueryPattern(pattern);
    if (pattern.length() < EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE) {
        return Collections.emptyList();
    }

    logger.log(Level.FINE, "Escaped search pattern: " + pattern);

    Lock lock = rwlock.readLock();
    lock.lock();
    if (exception != null) {
        // Release before throwing: the try/finally below has not been entered yet.
        lock.unlock();
        throw new RuntimeException("Failed to refesh index reader after last commit", exception);
    }

    try {
        List<HTMLSearchResult> results = new ArrayList<HTMLSearchResult>();
        analyzer = new TermNameAnalyzer(false);

        QueryParser parser = new QueryParser(Version.LUCENE_30, FIELD_TERM, analyzer);
        Query query = parser.parse(pattern);

        logger.log(Level.FINE, "Query: " + query);

        // For highlighting words in query results
        QueryScorer scorer = new QueryScorer(query, reader, FIELD_TERM);
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        SimpleHTMLEncoder htmlEncoder = new SimpleHTMLEncoder();
        Highlighter highlighter = new Highlighter(htmlFormatter, htmlEncoder, scorer);
        highlighter.setMaxDocCharsToAnalyze(MAX_CHARS);
        scorer.setExpandMultiTermQuery(true);

        // Perform search
        ScoreDoc[] hits = searcher.search(query, numberOfDocuments).scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            int id = hits[i].doc;
            Document doc = searcher.doc(id);
            String ontology = doc.get(FIELD_ONTOLOGY);
            String referenceId = doc.get(FIELD_ID);
            String term = doc.get(FIELD_TERM);
            byte[] synonymBytes = doc.getBinaryValue(FIELD_SYNONYM);
            boolean isSynonym = synonymBytes != null && synonymBytes.length == 1 && synonymBytes[0] == 1;

            if (!isSynonym || includeSynonyms) {
                Analyzer highlighterAnalyzer = new TermNameAnalyzer(true);
                try {
                    TokenStream tokenStream = TokenSources.getTokenStream(reader, id, FIELD_TERM,
                            highlighterAnalyzer);
                    TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, term, true, 1);
                    if (frag.length > 0 && frag[0] != null && frag[0].getScore() > 0) {
                        results.add(new HTMLSearchResult(ontology, referenceId, term, frag[0].toString(),
                                frag[0].getScore(), isSynonym));
                    }
                } finally {
                    // Close even when token stream creation or highlighting throws.
                    highlighterAnalyzer.close();
                }
            }
        }

        return results;
    } catch (ParseException e) {
        throw new InvalidQuerySyntaxException(e.getMessage(), e);
    } catch (TokenMgrError e) {
        throw new InvalidQuerySyntaxException(e.getMessage(), e);
    } catch (Throwable e) {
        String msg = "Failed to perform Lucene seach with pattern: " + pattern;
        logger.log(Level.WARNING, msg, e);
        throw new RuntimeException(msg, e);
    } finally {
        close(analyzer);
        lock.unlock();
    }
}

From source file:de.fhg.iais.cortex.search.TermRememberingFormatterTest.java

License:Apache License

/**
 * Highlighting with a token group built over an {@link EmptyTokenStream}
 * must leave the formatter's list of highlighted terms empty.
 */
@Test
public void testSimpleTest() {
    TermRememberingFormatter rememberingFormatter = new TermRememberingFormatter(new SimpleHTMLFormatter());
    rememberingFormatter.clearHighlightedTerms();

    TokenGroup emptyGroup = new TokenGroup(new EmptyTokenStream());
    rememberingFormatter.highlightTerm("foo bar jelly juice pumpkin", emptyGroup);

    List<String> rememberedTerms = rememberingFormatter.getHighlightedTerms();
    Assert.assertTrue(rememberedTerms.isEmpty());
}