Example usage for org.apache.lucene.search.highlight TokenSources getAnyTokenStream

Introduction

On this page you can find example usage of org.apache.lucene.search.highlight.TokenSources#getAnyTokenStream.

Prototype

@Deprecated 
public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
        throws IOException 

Document

A convenience method that tries a number of approaches to getting a token stream.
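
Before the full examples below, here is a minimal sketch of the pattern they all share: fetch the stored text for a hit, ask getAnyTokenStream for a token stream over that document (it re-uses term vectors when the field was indexed with them and otherwise re-analyzes the stored text with the given analyzer), and hand both to a Highlighter. This is an illustration only; the field name "content" and the reader, query, and analyzer parameters are assumptions, not taken from any example below.

public static void printBestFragments(IndexReader reader, Query query, Analyzer analyzer)
        throws IOException, InvalidTokenOffsetsException {
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs hits = searcher.search(query, 10);

    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
    for (ScoreDoc sd : hits.scoreDocs) {
        // the field must be stored, since the Highlighter needs the original text
        String text = searcher.doc(sd.doc).get("content");
        // re-uses term vectors if present; otherwise re-analyzes the stored text
        TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, sd.doc, "content", analyzer);
        for (String fragment : highlighter.getBestFragments(tokenStream, text, 3)) {
            System.out.println(fragment);
        }
    }
}

Since getAnyTokenStream is deprecated, newer code should prefer the TokenSources.getTokenStream overloads available in its Lucene version; check the corresponding Javadoc.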

Usage

From source file:ch.admin.isb.hermes5.business.search.HighlighterWrapper.java

License:Apache License

public String getHighlightedText(int id, String contentField) throws IOException {
    try {
        Document hitDoc = isearcher.doc(id);
        String text = hitDoc.get(contentField);

        TokenStream tokenStream = TokenSources.getAnyTokenStream(isearcher.getIndexReader(), id, contentField,
                analyzer);
        String fragment = highlighter.getBestFragments(tokenStream, text, numberOfFragments, "...");
        return trimFragment(fragment);
    } catch (InvalidTokenOffsetsException e) {
        throw new RuntimeException(e);
    }
}

From source file:ci6226.eval_index_reader.java

public static void Searchit(IndexReader reader, IndexSearcher searcher, Analyzer _analyzer, String field,
        String[] _searchList, int _topn, PrintWriter writer)
        throws org.apache.lucene.queryparser.classic.ParseException, IOException, InvalidTokenOffsetsException {
    Analyzer analyzer = _analyzer;

    QueryParser parser = new QueryParser(Version.LUCENE_47, field, analyzer);

    String[] testString = _searchList;

    for (int j = 0; j < testString.length; j++) {
        String lstr = String.valueOf(j) + "," + testString[j];
        Query query = parser.parse(testString[j]);
        System.out.println("Searching for: " + query.toString(field));
        TopDocs topdocs = searcher.search(query, _topn);
        lstr += "," + topdocs.totalHits;
        ScoreDoc[] scoreDocs = topdocs.scoreDocs;
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query.rewrite(reader)));
        for (int i = 0; i < scoreDocs.length; i++) {
            int doc = scoreDocs[i].doc;
            Document document = searcher.doc(doc);
            //      System.out.println("Snippet=" + document.get(field));
            System.out.println(i);
            String text = document.get(field);
            TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), doc, field,
                    analyzer);
            TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
            // keep the last scoring fragment, stripped of characters that would break the CSV output
            String line = "";
            for (int m = 0; m < frag.length; m++) {
                if ((frag[m] != null) && (frag[m].getScore() > 0)) {
                    System.out.println(frag[m].toString());
                    line = frag[m].toString();
                    line = line.replaceAll("\n", "");
                    line = line.replaceAll("\r", "");
                    line = line.replaceAll("\"", "");
                    line = line.replaceAll(",", " ");
                }
            }
            lstr += "," + line;
            lstr += "," + String.valueOf(scoreDocs[i].score);

        }
        writer.write(lstr + "\n");
        System.out.println("Search for:" + testString[j] + " Total hits=" + scoreDocs.length);
        System.out.println("////////////////////////////////////////////////////");
    }

}

From source file:com.main.Searcher.java

public List<Bean> searching(String s1, String s2, String radioBtn)
        throws IOException, ParseException, InvalidTokenOffsetsException {
    //getting reference of directory
    Directory dir = FSDirectory.open(Paths.get(Index_Dir));

    //Index reader - an interface for accessing a point-in-time view of a lucene index
    IndexReader reader = DirectoryReader.open(dir);

    IndexSearcher searcher = new IndexSearcher(reader);
    //analyzer with the default stop words, takes out the stop words
    Analyzer analyzer = new StandardAnalyzer();

    String contents = "contents";

    QueryParser parser = new QueryParser(contents, analyzer);

    Query q1 = parser.parse(s1);
    Query q2 = parser.parse(s2);

    //conjunction, disjunction and negation
    BooleanQuery.Builder bq = new BooleanQuery.Builder();

    //Occur.MUST: both queries are required in a doc
    if (radioBtn.equals("conjunction")) {
        bq.add(q1, BooleanClause.Occur.MUST);
        bq.add(q2, BooleanClause.Occur.MUST);
    } //Occur.SHOULD: at least one of the queries should be present in a doc
    else if (radioBtn.equals("disjunction")) {
        bq.add(q1, BooleanClause.Occur.SHOULD);
        bq.add(q2, BooleanClause.Occur.SHOULD);
    } //negation: the first query must be present, the second must not
    else {
        bq.add(q1, BooleanClause.Occur.MUST);
        bq.add(q2, BooleanClause.Occur.MUST_NOT);
    }

    TopDocs hits = searcher.search(bq.build(), 10);

    Formatter formatter = new SimpleHTMLFormatter();

    QueryScorer scorer = new QueryScorer(bq.build());

    //used to mark up highlighted terms found in the best sections of the content
    Highlighter highlighter = new Highlighter(formatter, scorer);
    //breaks the content up into same-size fragments but does not split up spans
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);

    //set the fragmenter on the highlighter
    highlighter.setTextFragmenter(fragmenter);

    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Bean bean = new Bean();

        int outResult = hits.scoreDocs.length;
        bean.setNumFile(outResult);
        int docid = hits.scoreDocs[i].doc;
        double rank = hits.scoreDocs[i].score;
        bean.setRankSc(rank);
        Document doc = searcher.doc(docid);

        String name = doc.get("name");
        bean.setTitle(name);

        String path = doc.get("path");
        bean.setPath(path);

        String cont = doc.get("contents");
        //Create token stream
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer);
        //Get highlighted cont fragments
        String[] frags = highlighter.getBestFragments(stream, cont, 10);

        ArrayList<String> dummy = new ArrayList<>();
        for (String frag : frags) {
            dummy.add(frag);
        }

        bean.setContent(dummy);
        beanList.add(bean);
    }

    dir.close();
    return beanList;
}

From source file:com.main.Searcher.java

public List<Bean> searching(String s1) throws IOException, ParseException, InvalidTokenOffsetsException {
    //Get directory reference
    Directory dir = FSDirectory.open(Paths.get(Index_Dir));
    //Index reader - an interface for accessing a point-in-time view of a lucene index
    IndexReader reader = DirectoryReader.open(dir);
    //Lucene searcher; it searches over a single IndexReader
    IndexSearcher searcher = new IndexSearcher(reader);
    //analyzer with the default stop words
    Analyzer analyzer = new StandardAnalyzer();
    //Query parser to be used for creating a TermQuery
    String contents = "contents";
    QueryParser parser = new QueryParser(contents, analyzer);

    Query q1 = parser.parse(s1);

    BooleanQuery.Builder bq = new BooleanQuery.Builder();

    bq.add(q1, BooleanClause.Occur.MUST);
    //Search the lucene documents
    TopDocs hits = searcher.search(bq.build(), 10);
    /**
     * Highlighter Code Start ***
     */
    //Uses HTML <B></B> tags to highlight the searched terms
    Formatter formatter = new SimpleHTMLFormatter();
    //Scores content fragments by the number of unique query terms found
    QueryScorer scorer = new QueryScorer(bq.build());
    //used to mark up highlighted terms found in the best sections of the content
    Highlighter highlighter = new Highlighter(formatter, scorer);
    //breaks the content up into same-size fragments but does not split up spans
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);

    //set the fragmenter on the highlighter
    highlighter.setTextFragmenter(fragmenter);
    //Iterate over found results
    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Bean bean = new Bean();
        int outResult = hits.scoreDocs.length;
        bean.setNumFile(outResult);
        int docid = hits.scoreDocs[i].doc;
        double rank = hits.scoreDocs[i].score;
        bean.setRankSc(rank);
        Document doc = searcher.doc(docid);
        String name = doc.get("name");
        bean.setTitle(name);

        String path = doc.get("path");
        bean.setPath(path);

        String cont = doc.get("contents");
        //Create token stream
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer);
        //Get highlighted cont fragments
        String[] frags = highlighter.getBestFragments(stream, cont, 10);

        ArrayList<String> dummy = new ArrayList<>();
        for (String frag : frags) {
            dummy.add(frag);
        }

        bean.setContent(dummy);
        beanList.add(bean);
    }

    dir.close();
    return beanList;
}

From source file:Example.lucene.SearchNHilight.java

public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException {
    //... Above, create documents with two fields, one with term vectors (tv) and one without (notv)
    Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45);

    Directory index = FSDirectory.open(new File("data/indexing"));
    String querystr = args.length > 0 ? args[0] : "golf user";
    // the "title" arg specifies the default field to use
    // when no field is explicitly specified in the query.
    Query query = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer)
            .parse(querystr);

    // 3. search
    int hitsPerPage = 10;
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);

    TopDocs hits = searcher.search(query, hitsPerPage);

    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
    Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
    String preview;
    //iterate only over the hits actually returned, to avoid running past scoreDocs
    for (int i = 0; i < hits.scoreDocs.length; i++) {
        int id = hits.scoreDocs[i].doc;
        Document doc = searcher.doc(id);
        String text;
        preview = "";
        System.out.println(doc.get("url"));
        System.out.println(doc.get("title"));
        text = doc.get("content");
        TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content",
                analyzer);
        TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
        int k = 0;
        for (TextFragment frag1 : frag) {
            if ((frag1 != null) && (frag1.getScore() > 0)) {
                preview += (frag1.toString()) + "...<br>";
                k++;
                // Get 2 Line Preview
                if (k >= 2)
                    break;
            }
        }
        //print the assembled preview
        System.out.println(preview);
        System.out.println("-------------");
    }
}

From source file:it.cnr.ilc.lc.clavius.search.Tester.java

private static void searchWithHighlighter(String term)
        throws IOException, ParseException, InvalidTokenOffsetsException {

    logger.info("searchWithContext2 (" + term + ")");
    Directory indexDirectory = FSDirectory
            .open(Paths.get("/var/lucene/claviusTest/indexes/it.cnr.ilc.lc.clavius.search.entity.PlainText"));
    DirectoryReader ireader = DirectoryReader.open(indexDirectory);

    IndexSearcher searcher = new IndexSearcher(ireader);
    QueryParser parser = new QueryParser("content", new StandardAnalyzer());
    Query query = parser.parse(term);

    TopDocs hits = searcher.search(query, 10);

    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
    ClaviusHighlighter highlighter = new ClaviusHighlighter(htmlFormatter, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(9));
    for (int i = 0; i < hits.scoreDocs.length; i++) {
        int id = hits.scoreDocs[i].doc;
        Document doc = searcher.doc(id);
        String idDoc = doc.get("idDoc");
        TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content",
                new StandardAnalyzer());
        List<Annotation> frag = highlighter.getBestTextClaviusFragments(tokenStream, idDoc, false, 10);
        for (int j = 0; j < frag.size(); j++) {
            logger.info("idDoc: " + idDoc + ", Annotation[" + j + "] " + frag.get(j).toString());
        }
    }
}

From source file:it.cnr.ilc.lc.claviusweb.ClaviusSearch.java

private static List<Annotation> fullTextSearch(String term)
        throws IOException, ParseException, InvalidTokenOffsetsException {

    log.info("fullTextSearch (" + term + ")");
    List<Annotation> result = new ArrayList<>();

    try {
        Directory indexDirectory = FSDirectory
                .open(Paths.get("/var/lucene/clavius-1.0.5/indexes/it.cnr.ilc.lc.claviusweb.entity.PlainText"));
        DirectoryReader ireader = DirectoryReader.open(indexDirectory);

        IndexSearcher searcher = new IndexSearcher(ireader);

        Analyzer fullTextAnalyzer = CustomAnalyzer.builder()
                .addCharFilter("patternReplace", "pattern", "([\\-\\(\\)\\[\\],\\.;:])", "replacement", " $1 ")
                .withTokenizer("whitespace").build();

        Query query = new WildcardQuery(new Term("content", term));
        TopDocs hits = searcher.search(query, MAX_SEARCH_HITS);

        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        ClaviusHighlighter highlighter = new ClaviusHighlighter(htmlFormatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter());

        log.info("hits.totalHits=(" + hits.totalHits + ")");
        for (int i = 0; i < hits.scoreDocs.length; i++) {
            int id = hits.scoreDocs[i].doc;
            Document doc = searcher.doc(id);
            String idDoc = doc.get("idDoc");

            TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content",
                    fullTextAnalyzer);

            List<Annotation> frag = highlighter.getBestTextClaviusFragments(tokenStream, doc, false, 10);
            for (int j = 0; j < frag.size(); j++) {
                log.debug("idDoc: " + idDoc + ", Annotation[" + j + "] " + frag.get(j).toString());
            }
            result.addAll(frag);
        }
    } catch (InvalidTokenOffsetsException | IOException e) {
        log.error(e);
    }
    log.info("Full Text Search found " + result.size() + " result(s) for term " + term);
    return result;
}

From source file:Main.WebAPI.Search.java

/**
 * @param args args[0] is a query
 *
 * @throws IOException
 * @throws ParseException
 * @throws InvalidTokenOffsetsException
 */

public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException {
    //... Above, create documents with two fields, one with term vectors (tv) and one without (notv)
    Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45);

    Directory index = FSDirectory.open(new File("data/indexing"));
    String querystr = args.length > 0 ? args[0] : "mike lab";
    // the "title" arg specifies the default field to use
    // when no field is explicitly specified in the query.
    Query query = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer)
            .parse(querystr);

    // 3. search
    int hitsPerPage = 10;
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);

    TopDocs hits = searcher.search(query, hitsPerPage);

    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
    Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
    String preview;
    //iterate only over the hits actually returned, to avoid running past scoreDocs
    for (int i = 0; i < hits.scoreDocs.length; i++) {
        int id = hits.scoreDocs[i].doc;
        Document doc = searcher.doc(id);
        String text;
        preview = "";
        System.out.println(doc.get("url"));
        System.out.println(doc.get("title"));
        text = doc.get("content");
        TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content",
                analyzer);
        TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
        int k = 0;
        for (TextFragment frag1 : frag) {
            if ((frag1 != null) && (frag1.getScore() > 0)) {
                preview += (frag1.toString()) + "...<br>";
                k++;
                // Get 2 Line Preview
                if (k >= 2)
                    break;
            }
        }
        //print the assembled preview
        System.out.println(preview);
        System.out.println("-------------");
    }
}

From source file:org.apache.blur.utils.HighlightHelper.java

License:Apache License

/**
 * NOTE: This method will not preserve the correct field types.
 *
 * @param preTag
 * @param postTag
 */
public static Document highlight(int docId, Document document, Query query, FieldManager fieldManager,
        IndexReader reader, String preTag, String postTag) throws IOException, InvalidTokenOffsetsException {

    String fieldLessFieldName = fieldManager.getFieldLessFieldName();

    Query fixedQuery = fixSuperQuery(query, null, fieldLessFieldName);

    Analyzer analyzer = fieldManager.getAnalyzerForQuery();

    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(preTag, postTag);
    Document result = new Document();
    for (IndexableField f : document) {
        String name = f.name();
        if (fieldLessFieldName.equals(name) || FIELDS_NOT_TO_HIGHLIGHT.contains(name)) {
            result.add(f);
            continue;
        }
        String text = f.stringValue();
        Number numericValue = f.numericValue();

        Query fieldFixedQuery;
        if (fieldManager.isFieldLessIndexed(name)) {
            fieldFixedQuery = fixSuperQuery(query, name, fieldLessFieldName);
        } else {
            fieldFixedQuery = fixedQuery;
        }

        if (numericValue != null) {
            if (shouldNumberBeHighlighted(name, numericValue, fieldFixedQuery)) {
                String numberHighlight = preTag + text + postTag;
                result.add(new StringField(name, numberHighlight, Store.YES));
            }
        } else {
            Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(fieldFixedQuery, name));
            TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, docId, name, analyzer);
            TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
            for (int j = 0; j < frag.length; j++) {
                if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                    result.add(new StringField(name, frag[j].toString(), Store.YES));
                }
            }
        }
    }
    return result;
}

From source file:perf.SearchTask.java

License:Apache License

private void hilite(int docID, IndexState indexState, IndexSearcher searcher) throws IOException {
    //System.out.println("  title=" + searcher.doc(docID).get("titleTokenized"));
    if (indexState.fastHighlighter != null) {
        for (String h : indexState.fastHighlighter.getBestFragments(fieldQuery, searcher.getIndexReader(),
                docID, indexState.textFieldName, 100, 2)) {
            totHiliteHash += h.hashCode();
            //System.out.println("    frag: " + h);
        }
    } else {
        Document doc = searcher.doc(docID);
        String text = doc.get(indexState.textFieldName);
        // NOTE: passing null for analyzer: TermVectors must
        // be indexed!
        TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), docID,
                indexState.textFieldName, null);
        TextFragment[] frags;
        try {
            frags = highlighter.getBestTextFragments(tokenStream, text, false, 2);
        } catch (InvalidTokenOffsetsException ioe) {
            throw new RuntimeException(ioe);
        }

        for (int j = 0; j < frags.length; j++) {
            if (frags[j] != null && frags[j].getScore() > 0) {
                //System.out.println("    frag " + j + ": " + frags[j].toString());
                totHiliteHash += frags[j].toString().hashCode();
            }
        }
    }
}