// Crawler.java :  » Content-Management-System » contelligent » de » finix » contelligent » search » engine » Java Open Source
//
// Java Open Source » Content Management System » contelligent
// contelligent » de » finix » contelligent » search » engine » Crawler.java
/*
 * Copyright 2001-2006 C:1 Financial Services GmbH
 *
 * This software is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License Version 2.1, as published by the Free Software Foundation.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
 */

package de.finix.contelligent.search.engine;

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;

import org.apache.lucene.document.Document;

import de.finix.contelligent.CallData;
import de.finix.contelligent.ComponentManager;
import de.finix.contelligent.ComponentPath;
import de.finix.contelligent.Session;
import de.finix.contelligent.core.ContelligentImpl;
import de.finix.contelligent.core.security.ContelligentSecurityManager;
import de.finix.contelligent.logging.LoggingService;
import de.finix.contelligent.xml.elements.IndexBuilderElement;
import de.finix.contelligent.xml.elements.IndexBuilderFilterElement;

/**
 * Collects the components below a configured root path, optionally narrows
 * them down with the configured {@link CrawlerFilter}s and appends the Lucene
 * documents created for every remaining component to a {@link LuceneIndex}.
 */
class Crawler {
    final static org.apache.log4j.Logger log = LoggingService.getLogger(Crawler.class);

    /** Component type that is crawled when no include types are configured. */
    private static final String DEFAULT_INCLUDE_TYPE = "contelligent.website.Page";

    /** Root of the subtree that is crawled. */
    private final ComponentPath root;

    /** Include configuration; its keys are the component types to crawl. */
    private final Map includes = new HashMap();

    /** Category values handed to the rendering document factory; never null. */
    private final Map categoryValues;

    /** Render parameters handed to the rendering document factory; never null. */
    private final Map renderParameters;

    /** Index that receives the created documents. */
    private final LuceneIndex index;

    /** Configured {@link IndexBuilderFilterElement}s; an empty collection means "no filtering". */
    Collection filters;

    /**
     * Creates a crawler from an index builder configuration element.
     *
     * @param index
     *            the index the crawled documents are added to
     * @param element
     *            supplies root directory, includes, category values, render
     *            parameters and filters
     */
    public Crawler(LuceneIndex index, IndexBuilderElement element) {
        this(new ComponentPath(element.getDir()), element.getIncludes(), element.getCategoryValues(), element
                .getRenderParameters(), index, element.getFilters());
    }

    /**
     * Creates a crawler from explicit settings.
     *
     * @param root
     *            root of the subtree to crawl
     * @param includes
     *            map whose keys are the component types to crawl; if null or
     *            empty, only <code>contelligent.website.Page</code> is crawled
     * @param categoryValues
     *            category values for rendering; null is treated as empty
     * @param renderParameters
     *            render parameters; null is treated as empty
     * @param index
     *            the index the crawled documents are added to
     * @param filters
     *            collection of {@link IndexBuilderFilterElement}s; null is
     *            treated as empty (no filtering)
     */
    protected Crawler(ComponentPath root, Map includes, Map categoryValues, Map renderParameters, LuceneIndex index,
            Collection filters) {
        this.root = root;
        this.index = index;
        // A missing filter configuration is handled like an empty one so that
        // run() does not abort with a NullPointerException later on.
        this.filters = (filters != null) ? filters : Collections.EMPTY_LIST;

        // Null is handled like "nothing configured", consistent with the
        // null handling of categoryValues and renderParameters below.
        if (includes == null || includes.isEmpty()) {
            this.includes.put(DEFAULT_INCLUDE_TYPE, "");
        } else {
            this.includes.putAll(includes);
        }
        this.categoryValues = (categoryValues != null) ? categoryValues : Collections.EMPTY_MAP;
        this.renderParameters = (renderParameters != null) ? renderParameters : Collections.EMPTY_MAP;
    }

    /**
     * Crawls the subtree below the root path and adds the resulting documents
     * to the index.
     * <p>
     * A session for the index user is opened for the duration of the crawl
     * and invalidated afterwards. Components that fail to be converted or
     * added are logged and skipped; any other failure is logged and ends the
     * run without propagating the exception.
     */
    public void run() {
        Session session = null;
        try {
            final ContelligentImpl contelligent = ContelligentImpl.getInstance();
            final ComponentManager cm = contelligent.getRootComponentManager();
            session = contelligent.beginSession(ContelligentSecurityManager.getIndexUser(), cm);
            CallData callData = contelligent.createCallData(session);

            Collection paths = cm.getComponentsInSubtreeFilteredByType(root, includes.keySet());

            final Collection filteredPaths = filterPaths(paths, callData);

            // The index decides whether components are rendered before
            // indexing or indexed in their raw form.
            final LuceneDocumentFactory documentFactory;
            if (index.isRender()) {
                documentFactory = new RenderingDocumentFactory(categoryValues, renderParameters, includes, cm, session,
                        callData, contelligent.getCategoryManager());
            } else {
                documentFactory = new RawDocumentFactory(cm);
            }

            index.apply(new LuceneIndexAppender() {

                public void perform(LuceneIndexAppenderAdapter adapter) {
                    for (Iterator pathIterator = filteredPaths.iterator(); pathIterator.hasNext();) {
                        ComponentPath componentPath = (ComponentPath) pathIterator.next();
                        try {
                            Iterator documents = documentFactory.createDocuments(componentPath).iterator();
                            while (documents.hasNext()) {
                                adapter.add((Document) documents.next());
                            }
                        } catch (Exception e) {
                            // Best effort: one broken component must not stop
                            // the rest of the crawl. The stack trace is only
                            // written at debug level to keep the log compact.
                            log.warn("Failed to add component " + componentPath + " to index (" + e.getMessage() + ")");
                            if (log.isDebugEnabled()) {
                                log.debug("Failed to add component to index", e);
                            }
                        }
                    }
                }
            });
        } catch (Exception e) {
            log.error("run() failed", e);
        } finally {
            if (session != null) {
                ContelligentImpl.getInstance().invalidateSession(session);
            }
        }
    }

    /**
     * Reduces the given paths to those accepted by at least one configured
     * filter.
     *
     * @param paths
     *            the candidate component paths
     * @param callData
     *            call data passed on to every filter
     * @return <code>paths</code> itself if no filters are configured;
     *         otherwise a new set containing the union of the paths accepted
     *         by the individual filters
     */
    private Collection filterPaths(Collection paths, CallData callData) {
        if (filters == null || filters.isEmpty()) {
            return paths;
        }

        HashSet filteredPaths = new HashSet();

        for (Iterator f = filters.iterator(); f.hasNext();) {
            IndexBuilderFilterElement filterElement = (IndexBuilderFilterElement) f.next();

            try {
                CrawlerFilter filter = FilterEngine.getInstance().getFilterInstance(filterElement);

                filteredPaths.addAll(applyFilterToPaths(filter, paths, filterElement, callData));
            } catch (FilterException e) {
                // The filter contributes nothing and the remaining filters
                // are still applied. Note that this also covers a
                // FilterException raised while the filter is being applied.
                log.warn("CrawlerFilter Implementation could not be resolved '" + filterElement.getImpl() + "'", e);
            }
        }

        if (log.isDebugEnabled()) {
            log.debug("filterPaths() - filtered paths: '" + paths + "'  =>  '" + filteredPaths + "'");
        }

        return filteredPaths;
    }

    /**
     * Applies a single filter to every given path.
     *
     * @param filter
     *            the filter to apply
     * @param pathsToFilter
     *            the candidate component paths
     * @param filterConfig
     *            the configuration element of the filter
     * @param callData
     *            call data passed on to the filter
     * @return a new set with the paths for which the filter returned
     *         <code>true</code>
     * @throws FilterException
     *             declared for failures while applying the filter
     */
    private Collection applyFilterToPaths(CrawlerFilter filter, Collection pathsToFilter,
            IndexBuilderFilterElement filterConfig, CallData callData) throws FilterException {
        HashSet result = new HashSet();

        for (Iterator i = pathsToFilter.iterator(); i.hasNext();) {
            ComponentPath path = (ComponentPath) i.next();
            if (filter.filter(path, filterConfig, callData)) {
                result.add(path);
            }
        }
        return result;
    }
}
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.