org.apache.nutch.searcher.NutchBean.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.searcher.NutchBean.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.searcher;

import java.io.*;
import java.util.*;
import javax.servlet.ServletContext;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.Closeable;
import org.apache.hadoop.conf.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.indexer.*;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.*;

import org.apache.lucene.search.PwaFunctionsWritable;

import org.apache.nutch.global.Global;

/** 
 * One stop shopping for search-related functionality.
 * @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $
 */
public class NutchBean implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks,
        DistributedSearch.Protocol, Closeable {

    public static final Log LOG = LogFactory.getLog(NutchBean.class);

    /** Sentinel meaning "ignore the matched-documents cap when ranking". */
    public static final int MATCHED_DOCS_CONST_IGNORE = -2;

    /** Names of the fetched segments backing this bean. */
    private String[] segmentNames;

    // Delegates. Either all of these point at local index/segment readers
    // (see init(Path,...)) or all point at the same DistributedSearch.Client
    // (see init(Client)).
    private Searcher searcher;
    private HitDetailer detailer;
    private HitSummarizer summarizer;
    private HitContent content;
    private HitInlinks linkDb;

    /** BooleanQuery won't permit more than 32 required/prohibited clauses.  We
     * don't want to use too many of those. */
    private static final int MAX_PROHIBITED_TERMS = 20;

    private Configuration conf;
    private FileSystem fs;

    // Limits loaded from configuration. A negative value (the -1 default used
    // when the property is absent) means "no limit".
    private int maxFulltextMatchesReturned;
    private int maxFulltextMatchesRanked;
    private int maxQueryTerms;
    private int maxQueryExtraTerms;

    /** Cache in servlet context: returns the shared bean, creating and
     * registering it under the "nutchBean" attribute on first use.
     *
     * @param app servlet context used as the cache
     * @param conf configuration used if a new bean must be created
     * @return the cached or newly created bean
     * @throws IOException if bean construction fails
     */
    public static NutchBean get(ServletContext app, Configuration conf) throws IOException {
        NutchBean bean = (NutchBean) app.getAttribute("nutchBean");
        if (bean == null) {
            LOG.info("creating new bean");
            bean = new NutchBean(conf);
            app.setAttribute("nutchBean", bean);
        }
        return bean;
    }

    /**
     * Construct in the default directory taken from "searcher.dir" (or "crawl").
     * @param conf configuration
     * @throws IOException if the index, segments or linkdb cannot be opened
     */
    public NutchBean(Configuration conf) throws IOException {
        this(conf, null, null);
    }

    /**
     * Construct in a named directory.  If the directory contains a
     * "search-servers.txt" file, a distributed-search client is used;
     * otherwise local index/segments/linkdb readers are opened.
     *
     * @param conf configuration
     * @param dir crawl directory, or null to use the configured default
     * @param blacklistFile optional blacklist passed to the index searcher
     * @throws IOException if the index, segments or linkdb cannot be opened
     */
    public NutchBean(Configuration conf, Path dir, File blacklistFile) throws IOException {
        this.conf = conf;
        this.fs = FileSystem.get(this.conf);
        if (dir == null) {
            dir = new Path(this.conf.get("searcher.dir", "crawl"));
        }
        Path servers = new Path(dir, "search-servers.txt");
        if (fs.exists(servers)) {
            LOG.info("searching servers in " + servers);
            init(new DistributedSearch.Client(servers, conf));
        } else {
            init(new Path(dir, "index"), new Path(dir, "indexes"), new Path(dir, "segments"),
                    new Path(dir, "linkdb"), blacklistFile);
        }

        // -1 (property absent) is treated as "unlimited" by the clamps below.
        this.maxFulltextMatchesReturned = conf.getInt(Global.MAX_FULLTEXT_MATCHES_RETURNED, -1);
        this.maxFulltextMatchesRanked = conf.getInt(Global.MAX_FULLTEXT_MATCHES_RANKED, -1);
        this.maxQueryTerms = conf.getInt(Global.MAX_QUERY_TERMS, -1);
        this.maxQueryExtraTerms = conf.getInt(Global.MAX_QUERY_EXTRA_TERMS, -1);
    }

    /** Open local searchers: a merged index if present, otherwise every
     * completed (Indexer.DONE_NAME-marked) index under indexesDir, plus the
     * fetched segments and the linkdb. */
    private void init(Path indexDir, Path indexesDir, Path segmentsDir, Path linkDb, File blacklistFile)
            throws IOException {

        IndexSearcher indexSearcher;
        if (this.fs.exists(indexDir)) {
            LOG.info("opening merged index in " + indexDir);
            indexSearcher = new IndexSearcher(indexDir, this.conf, blacklistFile);
        } else {
            LOG.info("opening indexes in " + indexesDir);

            Vector vDirs = new Vector();
            Path[] directories = fs.listPaths(indexesDir);
            // Iterate over the listing fetched once above; re-listing the
            // filesystem in the loop condition would do one listing per pass.
            for (int i = 0; i < directories.length; i++) {
                Path indexdone = new Path(directories[i], Indexer.DONE_NAME);
                if (fs.isFile(indexdone)) {
                    vDirs.add(directories[i]);
                }
            }

            directories = (Path[]) vDirs.toArray(new Path[vDirs.size()]);

            indexSearcher = new IndexSearcher(directories, this.conf, blacklistFile);
        }

        LOG.info("opening segments in " + segmentsDir);
        FetchedSegments segments = new FetchedSegments(this.fs, segmentsDir.toString(), this.conf);

        this.segmentNames = segments.getSegmentNames();

        this.searcher = indexSearcher;
        this.detailer = indexSearcher;
        this.summarizer = segments;
        this.content = segments;

        LOG.info("opening linkdb in " + linkDb);
        this.linkDb = new LinkDbInlinks(fs, linkDb, this.conf);
    }

    /** Route every role (search, details, summaries, content, inlinks) to the
     * distributed-search client. */
    private void init(DistributedSearch.Client client) {
        this.segmentNames = client.getSegmentNames();
        this.searcher = client;
        this.detailer = client;
        this.summarizer = client;
        this.content = client;
        this.linkDb = client;
    }

    /** @return the names of the segments this bean searches over */
    public String[] getSegmentNames() {
        return segmentNames;
    }

    public Hits search(Query query, int numHits) throws IOException {
        return search(query, numHits, null, null, false);
    }

    public Hits search(Query query, int numHits, String dedupField, String sortField, boolean reverse)
            throws IOException {

        return searcher.search(query, numHits, dedupField, sortField, reverse);
    }

    /** Hits sharing one dedup value, plus a flag recording that the
     * per-dup cap was hit and later duplicates were dropped. */
    private class DupHits extends ArrayList {
        private boolean maxSizeExceeded;
    }

    /** Search for pages matching a query, eliminating excessive hits from the
     * same site.  Hits after the first <code>maxHitsPerDup</code> from the same
     * site are removed from results.  The remaining hits have {@link
     * Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero then all
     * hits are returned.
     * 
     * @param query query
     * @param numHits number of requested hits
     * @param maxHitsPerDup the maximum hits returned with matching values, or zero
     * @return Hits the matching hits
     * @throws IOException
     */
    public Hits search(Query query, int numHits, int maxHitsPerDup) throws IOException {
        return search(query, numHits, maxHitsPerDup, "site", null, false, false);
    }

    /** Search for pages matching a query, eliminating excessive hits with
     * matching values for a named field.  Hits after the first
     * <code>maxHitsPerDup</code> are removed from results.  The remaining hits
     * have {@link Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero
     * then all hits are returned.
     * 
     * @param query query
     * @param numHits number of requested hits
     * @param maxHitsPerDup the maximum hits returned with matching values, or zero
     * @param dedupField field name to check for duplicates
     * @return Hits the matching hits
     * @throws IOException
     */
    public Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField) throws IOException {
        return search(query, numHits, maxHitsPerDup, dedupField, null, false, false);
    }

    /** Search for pages matching a query, eliminating excessive hits with
     * matching values for a named field.  Hits after the first
     * <code>maxHitsPerDup</code> are removed from results.  The remaining hits
     * have {@link Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero
     * then all hits are returned.
     * 
     * @param query query
     * @param numHits number of requested hits
     * @param searcherMaxHits number of matched documents for ranking, or MATCHED_DOCS_CONST_IGNORE to ignore   
     * @param maxHitsPerDup the maximum hits returned with matching values, or zero
     * @param dedupField field name to check for duplicates
     * @param sortField Field to sort on (or null if no sorting).
     * @param reverse True if we are to reverse sort by <code>sortField</code>.
     * @param functions Extra parameters   
     * @param maxHitsPerVersion maximum hits returned with the same url and different version
     * @return Hits the matching hits
     * @throws IOException
     */
    public Hits search(Query query, int numHits, int searcherMaxHits, int maxHitsPerDup, String dedupField,
            String sortField, boolean reverse, PwaFunctionsWritable functions, int maxHitsPerVersion)
            throws IOException {
        return search(query, numHits, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse, functions,
                maxHitsPerVersion, false);
    }

    /** Search for pages matching a query, eliminating excessive hits with
     * matching values for a named field.  Hits after the first
     * <code>maxHitsPerDup</code> are removed from results.  The remaining hits
     * have {@link Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero
     * then all hits are returned.
     * 
     * @param query query
     * @param numHits number of requested hits
     * @param searcherMaxHits number of matched documents for ranking, or MATCHED_DOCS_CONST_IGNORE to ignore   
     * @param maxHitsPerDup the maximum hits returned with matching values, or zero
     * @param dedupField field name to check for duplicates
     * @param sortField Field to sort on (or null if no sorting).
     * @param reverse True if we are to reverse sort by <code>sortField</code>.
     * @param functions Extra parameters    
     * @param maxHitsPerVersion maximum hits returned with the same url and different version
     * @param waybackQuery if true it is a query from wayback; otherwise it is from nutchwax
     * @return Hits the matching hits
     * @throws IOException
     */
    public Hits search(Query query, int numHits, int searcherMaxHits, int maxHitsPerDup, String dedupField,
            String sortField, boolean reverse, PwaFunctionsWritable functions, int maxHitsPerVersion,
            boolean waybackQuery) throws IOException {

        Hits hits = null;
        if (waybackQuery) {
            // Wayback queries bypass term limiting and dedup post-processing.
            hits = searcher.search(query, numHits, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse,
                    functions, maxHitsPerVersion);
            hits.setTotalIsExact(true);
            return hits;
        }

        // Clamp to the configured maximums; a negative maximum means the
        // property is unset, in which case no clamp is applied (clamping to -1
        // would otherwise silently break every search).
        if (maxFulltextMatchesReturned >= 0 && numHits > maxFulltextMatchesReturned) {
            numHits = maxFulltextMatchesReturned;
        }
        if (maxFulltextMatchesRanked >= 0 && searcherMaxHits > maxFulltextMatchesRanked) {
            searcherMaxHits = maxFulltextMatchesRanked;
        }

        // limit query terms for full-text queries
        query = limitTerms(query);

        int numHitsRaw;
        float rawHitsFactor;
        if (maxHitsPerDup <= 0) {
            // No dedup requested: delegate directly.
            if (searcherMaxHits == MATCHED_DOCS_CONST_IGNORE && functions == null) {
                return searcher.search(query, numHits, dedupField, sortField, reverse);
            } else {
                return searcher.search(query, numHits, searcherMaxHits, maxHitsPerDup, dedupField, sortField,
                        reverse, functions, maxHitsPerVersion);
            }
        } else {
            // Over-fetch raw hits so that enough survive dedup filtering.
            rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
            numHitsRaw = (int) (numHits * rawHitsFactor);

            LOG.debug("searching for " + numHitsRaw + " raw hits");
            hits = searcher.search(query, numHitsRaw, searcherMaxHits, maxHitsPerDup, dedupField, sortField,
                    reverse, functions, maxHitsPerVersion); // the same method for all values of searcherMaxHits
        }

        boolean lastRequest = false;
        if (numHitsRaw > hits.getTotal()) { // BUG 200608 - do not keep re-requesting up to numHits when the match has a smaller number of hits
            lastRequest = true;
        }

        // remove duplicates block
        long total = hits.getTotal();
        Map dupToHits = new HashMap();      // dedup value -> DupHits collected so far
        List resultList = new ArrayList();  // surviving hits, in rank order
        Set seen = new HashSet();           // hits already processed across re-searches
        List excludedValues = new ArrayList(); // dedup values whose cap was reached
        boolean totalIsExact = true;
        for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
            // get the next raw hit
            if (rawHitNum >= hits.getLength()) {

                if (lastRequest) { // BUG 200608
                    break;
                }

                // optimize query by prohibiting more matches on some excluded values
                Query optQuery = (Query) query.clone();
                for (int i = 0; i < excludedValues.size(); i++) {
                    if (i == MAX_PROHIBITED_TERMS)
                        break;
                    optQuery.addProhibitedTerm(((String) excludedValues.get(i)), dedupField);
                }
                numHitsRaw = (int) (numHitsRaw * rawHitsFactor);
                LOG.debug("re-searching for " + numHitsRaw + " raw hits, query: " + optQuery);
                // hits = searchAux(optQuery, numHitsRaw, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse);  // for TREC 
                hits = searcher.search(optQuery, numHitsRaw, searcherMaxHits, maxHitsPerDup, dedupField, sortField,
                        reverse, functions, maxHitsPerVersion);
                if (numHitsRaw > hits.getTotal()) { // BUG 200608
                    lastRequest = true;
                }

                LOG.debug("found " + hits.getTotal() + " raw hits");
                rawHitNum = -1; // restart scan over the new, larger result set
                continue;
            }

            Hit hit = hits.getHit(rawHitNum);
            if (seen.contains(hit)) // processed in the previous query
                continue;
            seen.add(hit);

            // get dup hits for its value
            String value = hit.getDedupValue();
            DupHits dupHits = (DupHits) dupToHits.get(value);
            if (dupHits == null) {
                dupToHits.put(value, dupHits = new DupHits());
            }

            // does this hit exceed maxHitsPerDup?
            if (dupHits.size() == maxHitsPerDup) { // yes -- then ignore the hit 
                if (!dupHits.maxSizeExceeded) {

                    // mark prior hits with moreFromDupExcluded
                    for (int i = 0; i < dupHits.size(); i++) {
                        ((Hit) dupHits.get(i)).setMoreFromDupExcluded(true);
                    }
                    dupHits.maxSizeExceeded = true;

                    excludedValues.add(value); // exclude dup
                }
                totalIsExact = false;
            } else { // no -- then collect the hit
                resultList.add(hit);
                dupHits.add(hit);

                // are we done?
                // we need to find one more than asked for, so that we can tell if
                // there are more hits to be shown
                if (resultList.size() > numHits)
                    break;
            }
        }

        Hits results = new Hits(total, (Hit[]) resultList.toArray(new Hit[resultList.size()]));
        results.setTotalIsExact(totalIsExact);
        return results;
    }

    /**
     * Limit the number of plain query terms and extra query terms to the
     * configured maximums, truncating phrases as needed.  A negative maximum
     * (property unset) disables the corresponding limit entirely.
     *
     * @param input the original query (not modified)
     * @return a new query with excess clauses/terms dropped
     */
    public Query limitTerms(Query input) {
        Query output = new Query(input.getConf());
        Clause[] clauses = input.getClauses();
        int termsCounter = 0;
        int termsExtraCounter = 0;

        for (int i = 0; i < clauses.length; i++) {
            Clause c = clauses[i];

            if (c.getField().equals(Clause.DEFAULT_FIELD) && !c.isProhibited()
                    && maxQueryTerms >= 0 && termsCounter >= maxQueryTerms) { // a plain term and the limit was reached
                continue;
            }
            if ((!c.getField().equals(Clause.DEFAULT_FIELD) || c.isProhibited())
                    && maxQueryExtraTerms >= 0 && termsExtraCounter >= maxQueryExtraTerms) // an extra term or a NOT
                continue;

            if (c.isPhrase()) {
                Term[] terms = c.getPhrase().getTerms();

                // Truncate the phrase so the applicable counter never exceeds
                // its maximum; a negative maximum leaves the phrase intact.
                int newLength = terms.length;
                if (c.getField().equals(Clause.DEFAULT_FIELD) && !c.isProhibited()) {
                    if (maxQueryTerms >= 0 && terms.length + termsCounter > maxQueryTerms) {
                        newLength = maxQueryTerms - termsCounter;
                        termsCounter += newLength;
                    } else {
                        termsCounter += terms.length;
                    }
                } else {
                    if (maxQueryExtraTerms >= 0 && terms.length + termsExtraCounter > maxQueryExtraTerms) {
                        newLength = maxQueryExtraTerms - termsExtraCounter;
                        termsExtraCounter += newLength;
                    } else {
                        termsExtraCounter += terms.length;
                    }
                }

                if (newLength != terms.length) {
                    if (newLength == 1) {
                        // A one-term phrase degenerates to a plain term clause.
                        output.addClause(new Clause(terms[0], c.isRequired(), c.isProhibited(), c.getConf()));
                    } else {
                        Term[] newTerms = new Term[newLength];
                        System.arraycopy(terms, 0, newTerms, 0, newLength);
                        output.addClause(
                                new Clause(new Phrase(newTerms), c.isRequired(), c.isProhibited(), c.getConf()));
                    }
                } else {
                    output.addClause(c);
                }
            } else {
                output.addClause(c);
                if (c.getField().equals(Clause.DEFAULT_FIELD) && !c.isProhibited()) {
                    termsCounter++;
                } else {
                    termsExtraCounter++;
                }
            }
        }

        return output;
    }

    /**
     * Dedup search that ignores the matched-documents cap (searcherMaxHits).
     */
    public Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField, String sortField,
            boolean reverse, boolean waybackQuery) throws IOException {
        return search(query, numHits, MATCHED_DOCS_CONST_IGNORE, maxHitsPerDup, dedupField, sortField, reverse,
                null, Integer.MAX_VALUE, waybackQuery);
    }

    public String getExplanation(Query query, Hit hit, PwaFunctionsWritable functions) throws IOException {
        return searcher.getExplanation(query, hit, functions);
    }

    public String getExplanation(Query query, Hit hit) throws IOException {
        return searcher.getExplanation(query, hit, null);
    }

    public HitDetails getDetails(Hit hit) throws IOException {
        return detailer.getDetails(hit);
    }

    public HitDetails[] getDetails(Hit[] hits) throws IOException {
        return detailer.getDetails(hits);
    }

    /* BUG wayback 0000155 */
    public HitDetails[] getDetails(PwaRequestDetailsWritable details) throws IOException {
        return detailer.getDetails(details);
    }

    public Summary getSummary(HitDetails hit, Query query) throws IOException {
        return summarizer.getSummary(hit, query);
    }

    public Summary[] getSummary(HitDetails[] hits, Query query) throws IOException {
        return summarizer.getSummary(hits, query);
    }

    /* BUG nutchwax 0000616 */
    public Summary[] getSummary(PwaRequestSummaryWritable summaries) throws IOException {
        return summarizer.getSummary(summaries);
    }

    public byte[] getContent(HitDetails hit) throws IOException {
        return content.getContent(hit);
    }

    public ParseData getParseData(HitDetails hit) throws IOException {
        return content.getParseData(hit);
    }

    public ParseText getParseText(HitDetails hit) throws IOException {
        return content.getParseText(hit);
    }

    public String[] getAnchors(HitDetails hit) throws IOException {
        return linkDb.getAnchors(hit);
    }

    public Inlinks getInlinks(HitDetails hit) throws IOException {
        return linkDb.getInlinks(hit);
    }

    public long getFetchDate(HitDetails hit) throws IOException {
        return content.getFetchDate(hit);
    }

    /** Close all underlying readers and the filesystem.
     * NOTE(review): an IOException from an earlier close skips the later
     * ones — preserved from the original behavior. */
    public void close() throws IOException {
        if (content != null) {
            content.close();
        }
        if (searcher != null) {
            searcher.close();
        }
        if (linkDb != null) {
            linkDb.close();
        }
        if (fs != null) {
            fs.close();
        }
    }

    /** For debugging: run a query from the command line and print the top
     * hits with their summaries. */
    public static void main(String[] args) throws Exception {
        String usage = "NutchBean query";

        if (args.length == 0) {
            System.err.println(usage);
            System.exit(-1);
        }

        Configuration conf = NutchConfiguration.create();
        NutchBean bean = new NutchBean(conf);
        Query query = Query.parse(args[0], conf);
        Hits hits = bean.search(query, 10);
        System.out.println("Total hits: " + hits.getTotal());
        int length = (int) Math.min(hits.getTotal(), 10);
        Hit[] show = hits.getHits(0, length);
        HitDetails[] details = bean.getDetails(show);
        Summary[] summaries = bean.getSummary(details, query);

        // Iterate only over the hits actually fetched: the dedup search can
        // return up to numHits+1 hits, so indexing details/summaries (which
        // have `length` entries) by hits.getLength() could overrun the arrays.
        for (int i = 0; i < length; i++) {
            System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]);
        }
    }

    /** Hadoop RPC version handshake for the distributed-search protocol. */
    public long getProtocolVersion(String className, long arg1) throws IOException {
        if (DistributedSearch.Protocol.class.getName().equals(className)) {
            return 1;
        } else {
            throw new IOException("Unknown Protocol classname:" + className);
        }
    }

}