// SimilarityQueries.java :  » Search-Engine » lucene » org » apache » lucene » search » similar » Java Open Source
//
// Java Open Source » Search Engine » lucene
// lucene » org » apache » lucene » search » similar » SimilarityQueries.java
/**
 * Copyright 2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.similar;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

/**
 * Simple similarity measures.
 *
 * @see MoreLikeThis
 */
public final class SimilarityQueries
{
  /**
   * Not instantiable: this class only hosts static helper methods.
   */
  private SimilarityQueries()
  {
  }
  
  /**
   * Simple similarity query generator.
   * Takes every unique word and forms a boolean query where all words are optional.
   * After you get this you'll use it to query your {@link IndexSearcher} for similar docs.
   * The only caveat is the first hit returned <b>should be</b> your source document - you'll
   * need to then ignore that.
   *
   * <p>
   * So, if you have a code fragment like this:
   * <br>
   * <code>
   * Query q = formSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
   * </code>
   *
   * <p>
   * The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good)'</code>.
   *
   * <p>
   * The philosophy behind this method is "two documents are similar if they share lots of words".
   * Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher
   * similarity score if they share more uncommon words.
   *
   * <p>
   * This method is fail-safe in that if a long 'body' is passed in and
   * {@link BooleanQuery#add BooleanQuery.add()} (used internally)
   * throws
   * {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
   * query as built so far will be returned.
   *
   * <p>
   * The token stream obtained from the analyzer is reset before use and is always
   * ended and closed before this method returns, including on the fail-safe and
   * exceptional paths.
   *
   * @param body the body of the document you want to find similar documents to
   * @param a the analyzer to use to parse the body
   * @param field the field you want to search on, probably something like "contents" or "body"
   * @param stop optional set of stop words to ignore; may be <code>null</code>
   * @return a query with all unique words in 'body' as optional clauses
   * @throws IOException if the analyzer's token stream fails while tokenizing 'body'
   */
  public static Query formSimilarQuery( String body,
                      Analyzer a,
                      String field,
                      Set<?> stop)
                      throws IOException
  {
    TokenStream ts = a.tokenStream( field, new StringReader( body));
    try
    {
      TermAttribute termAtt = ts.addAttribute(TermAttribute.class);

      BooleanQuery tmp = new BooleanQuery();
      Set<String> already = new HashSet<String>(); // ignore dups

      // Consumer workflow: reset() -> incrementToken()* -> end() -> close().
      ts.reset();
      while (ts.incrementToken()) {
        String word = termAtt.term();
        // ignore opt stop words
        if ( stop != null && stop.contains( word)) {
          continue;
        }
        // ignore dups: add() returns false when the word was already seen
        if ( ! already.add( word)) {
          continue;
        }
        // add to query as an optional clause
        TermQuery tq = new TermQuery( new Term( field, word));
        try
        {
          tmp.add( tq, BooleanClause.Occur.SHOULD);
        }
        catch( BooleanQuery.TooManyClauses ignored)
        {
          // fail-safe, just return what we have, not the end of the world
          break;
        }
      }
      ts.end();
      return tmp;
    }
    finally
    {
      // Always release the stream (and the reader it wraps), whichever way we leave.
      ts.close();
    }
  }
}
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.