net.yacy.search.AutoSearch.java Source code

Java tutorial

Introduction

Here is the source code for net.yacy.search.AutoSearch.java

Source

/**
 *  AutoSearch.java
 *  Copyright 2015 by Burkhard Buelte
 *  First released 09.01.2015 at http://yacy.net
 *
 *  This is a part of YaCy, a peer-to-peer based web search engine
 *
 *  LICENSE
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.search;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import static net.yacy.cora.federate.opensearch.SRURSSConnector.loadSRURSS;
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.BookmarksDB.Bookmark;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.peers.Seed;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.CommonParams;

/**
 * AutoSearch retrieves queries from Bookmarks or a property file (if existing)
 * and loops to a list of connected peers and asks each for results which are
 * added to the local index.
 */
public class AutoSearch extends AbstractBusyThread {

    private Set<String> querystack; // serach query
    public String currentQuery = null; // current query
    private Set<String> currentTargets = null; // peer hashes
    final Switchboard sb;
    public int gotresults;
    private long lastInitTime; // to recognize new data (Bookmarks) to import

    public AutoSearch(Switchboard xsb) {
        super(3000, 1000); // set lower limits of cycle delay
        this.setIdleSleep(60000); // set actual cycle delays
        this.setBusySleep(10000);
        this.sb = xsb;

        gotresults = 0;
        querystack = new HashSet<String>();

        this.lastInitTime = System.currentTimeMillis() - 600000; // init to now - 10 min
        if (!checkBookmarkDB()) {
            try {
                // check for old queries in temp property file
                File pfile = new File(xsb.dataPath, "DATA/SETTINGS/autosearch.conf");
                if (pfile.exists()) {
                    ConcurrentLog.info(AutoSearch.class.getName(),
                            "read queries from file " + pfile.getAbsolutePath());
                    Properties prop = new Properties();
                    FileInputStream fileIn = new FileInputStream(pfile);
                    prop.load(fileIn);
                    if (prop.size() > 0) {
                        Set<Object> all = prop.keySet();
                        for (Object s : all) {
                            String query = prop.getProperty((String) s);
                            if (query != null && !query.isEmpty()) {
                                querystack.add(query);
                            }
                        }
                    }
                    fileIn.close();
                }
            } catch (final IOException e) {
                ConcurrentLog.warn(AutoSearch.class.getName(), "Error reading config file");
            }
        }
    }

    /**
     * Save current queries to a (temporary) property file to allow continue
     * after a restart. Existing file will be overwritten or deleted.
     */
    private void saveasPropFile() {
        File pfile = new File(sb.dataPath, "DATA/SETTINGS/autosearch.conf");
        if (querystack.size() == 0) {
            if (pfile.exists()) {
                pfile.delete();
            }
        } else {
            try {
                Properties prop = new Properties();
                for (String s : querystack) {
                    prop.put("query" + s.hashCode(), s);
                }
                OutputStream fileOut = new FileOutputStream(pfile);
                prop.store(fileOut, "AutoSearch query list");
                fileOut.close();
            } catch (FileNotFoundException ex) {
                ConcurrentLog.warn(AutoSearch.class.getName(), "can not create file " + pfile.getAbsolutePath());
            } catch (IOException ex) {
                ConcurrentLog.warn(AutoSearch.class.getName(),
                        "IO error writing to file " + pfile.getAbsolutePath());
            }
        }
    }

    /**
     * Get peers to query (peers connected)
     *
     * @return Set of peer hashes to contact
     */
    private void initPeerList() {
        if (currentTargets == null) {
            currentTargets = new HashSet<String>();
        }
        // TODO: DHT peers could be excluded
        Iterator<Seed> it = Switchboard.getSwitchboard().peers.seedsConnected(true, false, null, 0);
        while (it.hasNext()) {
            Seed s = it.next();
            currentTargets.add(s.hash);
        }
    }

    /**
     * Check BookmarkDB for existing queries return true if new entry added to
     * query queue. Store queries in (temporary) property file
     *
     * @return true if new query from bookmark was added
     */
    private boolean checkBookmarkDB() {
        int added = 0;
        Iterator<Bookmark> it = Switchboard.getSwitchboard().bookmarksDB.getBookmarksIterator();
        if (it != null) {
            while (it.hasNext()) {
                Bookmark bmk = it.next();
                // get search bookmarks only
                if (bmk.getFoldersString().startsWith("/search")) {
                    // take only new created or edited bookmarks
                    if (bmk.getTimeStamp() >= this.lastInitTime) {
                        final String query = bmk.getQuery();
                        if (query != null && !query.isEmpty()) {
                            {
                                querystack.add(query);
                                added++;
                                ConcurrentLog.info(AutoSearch.class.getName(),
                                        "add query from Bookmarks: query=" + query);
                            }
                        }
                    }
                }
            }
        }
        if (added > 0) {
            this.lastInitTime = System.currentTimeMillis();
            saveasPropFile();
            return true;
        }
        return false;
    }

    /**
     * Process query queue, select one query and peer to ask next
     *
     * @return true if something processed
     */
    @Override
    public boolean job() {

        if (currentQuery == null && querystack != null && querystack.size() > 0) {
            currentQuery = querystack.iterator().next();
            querystack.remove(currentQuery); // imediate remove to asure no repeat
            initPeerList(); // late initialization of peerlist to get currently connected
        }

        // ask next peer for search term
        if (currentQuery != null && !currentQuery.isEmpty()) {
            if (currentTargets != null && !currentTargets.isEmpty()) {
                while (currentTargets.size() > 0) { // loop only to skip disconnected peers
                    String peerhash = currentTargets.iterator().next();
                    currentTargets.remove(peerhash);
                    Seed seed = Switchboard.getSwitchboard().peers.getConnected(peerhash);
                    if (seed != null) {
                        processSingleTarget(seed);
                        return true; // just one query per busycycle is intended
                    }
                }
            }
            currentQuery = null;
        }

        // no search targets 
        checkBookmarkDB();

        // TODO: do idle processing
        // analyse content of local index
        // extend search with learned new search terms
        // follow most promising links
        ConcurrentLog.fine(AutoSearch.class.getName(), "nothing to do");
        return this.querystack.size() > 0;
    }

    /**
     * Calls one peer for search results of the current query and adds it to the
     * local index. Depending on peers SolrAvailable flag the a solr query or
     * opensearch/rss query is used.
     *
     * @param seed the peer to ask
     */
    private void processSingleTarget(Seed seed) {
        ConcurrentLog.fine(AutoSearch.class.getName(),
                "ask " + seed.getIP() + " " + seed.getName() + " for query=" + currentQuery);

        if (seed.getFlagSolrAvailable()) { // do a solr query
            SolrDocumentList docList = null;
            SolrQuery solrQuery = new SolrQuery();
            // use remote defaults and ranking (to query their index right)
            solrQuery.set(CommonParams.Q, currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)"); // except this yacy special
            solrQuery.set("q.op", "AND"); // except ... no one word matches please
            solrQuery.set(CommonParams.ROWS, "20");
            this.setName("Protocol.solrQuery(" + solrQuery.getQuery() + " to " + seed.hash + ")");
            try {
                RemoteInstance instance = new RemoteInstance(
                        "http://" + seed.getPublicAddress(seed.getIP()) + "/solr/", null, null, 10000); // this is a 'patch configuration' which considers 'solr' as default collection
                try {
                    SolrConnector solrConnector = new RemoteSolrConnector(instance, true, null);
                    if (!solrConnector.isClosed()) {
                        try {
                            QueryResponse rsp = solrConnector.getResponseByParams(solrQuery);
                            docList = rsp.getResults();
                        } catch (Throwable e) {
                        } finally {
                            solrConnector.close();
                        }
                    }
                } catch (Throwable ee) {
                } finally {
                    instance.close();
                }
                if (docList != null) {
                    for (SolrDocument d : docList) {
                        sb.index.fulltext()
                                .putDocument(sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(d));
                        this.gotresults++;
                    }
                    ConcurrentLog.info(AutoSearch.class.getName(), "added " + docList.size() + " results from "
                            + seed.getName() + " to index for solrquery=" + currentQuery);
                }
            } catch (Throwable eee) {
            }
        } else { // do a yacysearch.rss query
            final String rssSearchServiceURL = "http://" + seed.getPublicAddress(seed.getIP()) + "/yacysearch.rss";
            try {
                RSSFeed feed = loadSRURSS(rssSearchServiceURL, currentQuery, 0, 20, CacheStrategy.IFFRESH, false, // just local, as we ask others too
                        ClientIdentification.yacyInternetCrawlerAgent);
                final List<DigestURL> urls = new ArrayList<DigestURL>();
                for (final MultiProtocolURL entry : feed.getLinks()) {
                    urls.add(new DigestURL(entry, (byte[]) null));
                    this.gotresults++;
                }
                sb.addToIndex(urls, null, "AutoSearch", null, true);
                ConcurrentLog.info(AutoSearch.class.getName(), "added " + urls.size() + " results from "
                        + seed.getName() + " to index for query=" + currentQuery);
            } catch (IOException ex) {
                ConcurrentLog.info(AutoSearch.class.getName(), "no answer from " + seed.getName());
            }
        }
    }

    /**
     * Estimate of queries to perform
     */
    @Override
    public int getJobCount() {
        if (currentTargets != null) {
            int cnt = currentTargets.size();
            cnt += querystack.size() * sb.peers.sizeConnected();
            return cnt;
        }
        return 0;
    }

    @Override
    public void freemem() {
    }

    @Override
    public void close() {
        this.saveasPropFile(); // saves or deletes property file with queries
    }
}