Example usage for org.apache.lucene.search.highlight WeightedTerm WeightedTerm

List of usage examples for org.apache.lucene.search.highlight WeightedTerm WeightedTerm

Introduction

In this page you can find the example usage for org.apache.lucene.search.highlight WeightedTerm WeightedTerm.

Prototype

public WeightedTerm(float weight, String term) 

Source Link

Usage

From source file:com.liferay.portal.search.lucene.highlight.QueryTermExtractor.java

License:Open Source License

/**
 * Extracts the terms of {@code query} as {@link WeightedTerm}s, walking the
 * query tree iteratively with an explicit work list instead of recursion.
 *
 * @param query      root query to extract terms from; {@code null} yields an
 *                   empty array
 * @param prohibited whether terms from {@code MUST_NOT} clauses are included
 * @param fieldName  if non-{@code null}, only terms of this field are kept
 * @return the distinct weighted terms found (order unspecified)
 */
public static WeightedTerm[] getTerms(Query query, boolean prohibited, String fieldName) {
    if (query == null) {
        return _emptyWeightedTermArray;
    }

    Set<WeightedTerm> weightedTerms = new HashSet<WeightedTerm>();

    // Scratch set reused (and cleared) for each leaf query's extractTerms call.
    Set<Term> terms = new HashSet<Term>();

    // Pending sub-queries; addFirst/poll gives depth-first traversal.
    LinkedList<Query> queries = new LinkedList<Query>();

    Query lastQuery = query;

    while (lastQuery != null) {
        if (lastQuery instanceof BooleanQuery) {
            BooleanQuery booleanQuery = (BooleanQuery) lastQuery;

            BooleanClause[] booleanClauses = booleanQuery.getClauses();

            for (BooleanClause booleanClause : booleanClauses) {
                if (prohibited || (booleanClause.getOccur() != BooleanClause.Occur.MUST_NOT)) {

                    Query booleanClauseQuery = booleanClause.getQuery();

                    if (booleanClauseQuery != null) {
                        queries.addFirst(booleanClauseQuery);
                    }
                }
            }

            lastQuery = queries.poll();
        } else if (lastQuery instanceof FilteredQuery) {
            FilteredQuery filteredQuery = (FilteredQuery) lastQuery;

            lastQuery = filteredQuery.getQuery();

            if (lastQuery == null) {
                lastQuery = queries.poll();
            }
        } else {
            Class<? extends Query> queryClass = lastQuery.getClass();

            // Skip query classes already known to not support extractTerms.
            if (!_queryClasses.contains(queryClass)) {
                try {
                    lastQuery.extractTerms(terms);

                    for (Term term : terms) {
                        if ((fieldName == null) || fieldName.equals(term.field())) {

                            // Weight each term with the boost of the leaf query
                            // it came from, not the root query's boost. Using
                            // query.getBoost() here would flatten all
                            // sub-query boosts to the root's value.
                            WeightedTerm weightedTerm = new WeightedTerm(lastQuery.getBoost(), term.text());

                            weightedTerms.add(weightedTerm);
                        }
                    }

                    terms.clear();
                } catch (UnsupportedOperationException uoe) {
                    // Remember this class so we never retry extractTerms on it.
                    _queryClasses.addIfAbsent(queryClass);
                }
            }

            lastQuery = queries.poll();
        }
    }

    return weightedTerms.toArray(new WeightedTerm[weightedTerms.size()]);
}

From source file:org.apache.nutch.summary.lucene.LuceneSummarizer.java

License:Apache License

/**
 * Builds a highlighted summary of {@code text} for the given {@code query}.
 *
 * <p>Each query term is weighted equally (1.0f). Up to three best fragments
 * are returned, with highlighted parts and plain parts alternating (the
 * highlighter marks boundaries with {@code SEPARATOR}). If no fragment
 * matches, the leading tokens of the text (up to {@code maxLen} characters)
 * are returned instead, so the summary is never empty for non-empty text
 * containing at least one {@code <WORD>} token.
 *
 * @param text  document text to summarize
 * @param query parsed query whose terms drive the highlighting
 * @return the summary; possibly empty if highlighting fails or the text has
 *         no word tokens
 */
public Summary getSummary(String text, Query query) {

    String[] terms = query.getTerms();
    WeightedTerm[] weighted = new WeightedTerm[terms.length];
    for (int i = 0; i < terms.length; i++) {
        weighted[i] = new WeightedTerm(1.0f, terms[i]);
    }
    Highlighter highlighter = new Highlighter(FORMATTER, new QueryScorer(weighted));
    TokenStream tokens = analyzer.tokenStream("content", new StringReader(text));
    Summary summary = new Summary();
    try {
        // TODO : The max number of fragments (3) should be configurable
        String[] result = highlighter.getBestFragments(tokens, text, 3);
        for (int i = 0; i < result.length; i++) {
            // FORMATTER wraps matches in SEPARATOR, so splitting alternates
            // plain / highlighted parts, starting with a plain part.
            String[] parts = result[i].split(SEPARATOR);
            boolean highlight = false;
            for (int j = 0; j < parts.length; j++) {
                if (highlight) {
                    summary.add(new Highlight(parts[j]));
                } else {
                    summary.add(new Fragment(parts[j]));
                }
                highlight = !highlight;
            }
            summary.add(new Ellipsis());
        }

        // Bug 0000029: if the query terms do not occur in the text, an empty
        // summary used to be returned. Fall back to the first tokens instead.
        if (result == null || result.length == 0) {
            // Re-create the stream: the first one was consumed above.
            tokens = analyzer.tokenStream("content", new StringReader(text));

            Token firstToken = null, lastToken = null;
            Token token = null;
            int maxLen = 100; // the same as defined in SimpleFragmenter but it is private

            // Collect word tokens until the span exceeds maxLen characters.
            while ((token = tokens.next()) != null) {
                if (token.type().equals("<WORD>")) {
                    if (firstToken == null) {
                        firstToken = token;
                    } else if (token.endOffset() - firstToken.startOffset() < maxLen) {
                        lastToken = token;
                    } else {
                        break;
                    }
                }
            }
            if (lastToken == null) {
                lastToken = firstToken;
            }

            // Guard: text with no <WORD> token leaves firstToken null; the old
            // code threw an NPE here that was silently swallowed below.
            if (firstToken != null) {
                summary.add(new Fragment(text.substring(firstToken.startOffset(), lastToken.endOffset())));
                summary.add(new Ellipsis());
            }
        }

    } catch (Exception e) {
        // Best-effort summarization: any analysis/highlighting failure yields
        // whatever summary was built so far (possibly empty).
    }
    return summary;
}