org.wandora.indexer.AbstractIndexBuilder.java Source code

Introduction

Here is the source code for org.wandora.indexer.AbstractIndexBuilder.java, an abstract base class that builds and maintains a Lucene full-text index for the topics in a Wandora topic map.

Source

/*
 * WANDORA
 * Knowledge Extraction, Management, and Publishing Application
 * http://wandora.org
 * 
 * Copyright (C) 2004-2016 Wandora Team
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *
 * 
 *
 * AbstractIndexBuilder.java
 *
 * Created on August 8, 2006, 16:07
 *
 */

package org.wandora.indexer;

import java.util.*;
import java.io.*;
import org.wandora.topicmap.*;
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.wandora.application.WandoraToolLogger;

import org.wandora.piccolo.*;

/**
 * Abstract base class that builds a Lucene full-text index from a Wandora
 * topic map. Subclasses implement {@link #processTopic(Topic, IndexWriter)}
 * to control how each topic is turned into a Lucene document.
 *
 * @author olli
 */
public abstract class AbstractIndexBuilder {

    /** Maps content types to the extractors that handle them. */
    protected HashMap extractors;
    protected Logger logger;
    protected WandoraToolLogger toolLogger;

    /** Creates new AbstractIndexBuilder */
    public AbstractIndexBuilder() {
        this(new SimpleLogger());
    }

    public AbstractIndexBuilder(Logger logger) {
        this.logger = logger;
        extractors = new HashMap();
    }

    public AbstractIndexBuilder(WandoraToolLogger logger) {
        this.toolLogger = logger;
        extractors = new HashMap();
    }

    /** Registers an extractor under every content type it reports. */
    public void addExtractor(Extractor e) {
        String[] cts = e.getContentTypes();
        for (int i = 0; i < cts.length; i++) {
            extractors.put(cts[i], e);
        }
    }

    /**
     * Builds the Analyzer used when writing the index. Most fields use a
     * custom tokenizer that treats letters, digits, ':' and '-' as token
     * characters; the "topic" field is instead tokenized on newlines only
     * (see NewlineAnalyzer below).
     */
    public Analyzer getWriterAnalyzer() {
        //        PerFieldAnalyzerWrapper analyzer=new PerFieldAnalyzerWrapper(new StandardAnalyzer());
        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new Analyzer() {
            public TokenStream tokenStream(String fieldName, Reader reader) {
                // Note: stop words are filtered before lowercasing, so the
                // stop word check sees the original case of each token.
                return new LowerCaseFilter(new StopFilter(
                        //                new LetterTokenizer(reader)
                        new CharTokenizer(reader) {
                            protected boolean isTokenChar(char c) {
                                if (Character.isLetter(c) || Character.isDigit(c))
                                    return true;
                                if (c == ':' || c == '-')
                                    return true;
                                else
                                    return false;
                            }

                            protected char normalize(char c) {
                                if (c == ':' || c == '-')
                                    return c;
                                else
                                    return super.normalize(c);
                            }
                        }, StandardAnalyzer.STOP_WORDS));
            }
        });
        analyzer.addAnalyzer("topic", new NewlineAnalyzer());
        return analyzer;
    }
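
    /*
     * Illustration (added, not part of the original source): with the
     * analyzer above, a string like "Wandora - http://wandora.org" is split
     * on non-token characters such as ' ', '/' and '.', yielding roughly
     * the lowercased tokens "wandora", "-", "http:", "wandora", "org".
     */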

    /*    public void processTopicMap(String tm,String index) throws IOException {
    processTopicMap(WandoraManager.readTopicMap(tm),index);
        }
        public void processTopicMap(String tm,IndexWriter writer) throws IOException{
    WandoraManager manager=new WandoraManager();
    processTopicMap(WandoraManager.readTopicMap(tm),writer);
        }*/

    public void processTopicMap(TopicMap tm, String index) throws IOException {
        IndexWriter writer = null;
        Analyzer analyzer = null;
        File indexf = null;
        try {
            analyzer = getWriterAnalyzer();
            indexf = new File(index);
            writer = new IndexWriter(indexf, analyzer, true);
            log("Indexing topic map!");
            processTopicMap(tm, writer);
            log("Optimizing index!");
            writer.optimize();
            log("Indexing done!");
        } catch (Exception e) {
            log("Exception with analyzer '" + analyzer + "'.");
            log("Exception with topicmap '" + tm + "'.");
            log("Exception with search index '" + index + "'.");
            log("Exception with search index file '" + indexf + "'.");
            log("Exception with search writer '" + writer + "'.");
            log(e);
        }

        try {
            if (writer != null)
                writer.close();
        } catch (Exception e) {
            log(e);
        }
    }

    /**
     * Collects the topic's base name, its word tokens, and all variant
     * names into a comma-separated keyword string.
     */
    protected String getKeyNames(Topic t) throws TopicMapException {
        StringBuilder keywords = new StringBuilder();

        if (t.getBaseName() != null) {
            keywords.append(t.getBaseName());
            StringTokenizer st = new StringTokenizer(t.getBaseName(), ",-. ");
            while (st.hasMoreTokens()) {
                keywords.append(", ");
                keywords.append(st.nextToken());
            }
        }

        for (Set variantScope : t.getVariantScopes()) {
            String n = t.getVariant(variantScope);
            if (keywords.length() > 0)
                keywords.append(", ");
            keywords.append(n);
        }
        return keywords.toString();
    }

    /** Returns one subject identifier of the topic in external form, or null if it has none. */
    public String getTopicSubjectIndicator(Topic t) throws TopicMapException {
        org.wandora.topicmap.Locator l = t.getOneSubjectIdentifier();
        if (l == null)
            return null;
        else
            return l.toExternalForm();
    }

    /** Returns all subject identifiers of the topic in external form. */
    public Set getTopicSubjectIndicators(Topic t) throws TopicMapException {
        Set s = new HashSet();
        for (org.wandora.topicmap.Locator si : t.getSubjectIdentifiers()) {
            s.add(si.toExternalForm());
        }
        return s;
    }

    // ----------------------------------

    public void processTopic(String topicSI, TopicMap tm, IndexWriter writer)
            throws IOException, TopicMapException {
        processTopic(tm.getTopic(topicSI), writer);
    }

    public abstract void processTopic(Topic topic, IndexWriter writer) throws IOException, TopicMapException;

    /** Subclasses may override this to filter which topics are indexed; the default accepts every topic. */
    public boolean includeTopic(Topic t) {
        return true;
    }

    public void processTopicMap(TopicMap tm, IndexWriter writer) throws IOException, TopicMapException {
        Iterator iter = tm.getTopics();
        int count = 0;
        while (iter.hasNext()) {
            try {
                Topic topic = (Topic) iter.next();
                if (!includeTopic(topic))
                    continue;
                org.wandora.topicmap.Locator l = topic.getOneSubjectIdentifier();
                log("Processing topic (" + (count + 1) + ") " + l);
                processTopic(topic, writer);
                count++;
            } catch (Exception e) {
                // Unable to process topic!
                log(e);
            }
        }
    }

    public void removeTopicFromIndex(String si, IndexReader reader) throws IOException {
        reader.deleteDocuments(new Term("topic", si));
    }

    public void removeTopicsFromIndex(Set topics, IndexReader reader) throws IOException {
        Iterator iter = topics.iterator();
        while (iter.hasNext()) {
            try {
                removeTopicFromIndex((String) iter.next(), reader);
            } catch (Exception e) {
                // Unable to remove topic!
                log(e);
            }
        }
    }

    /**
     * Reads the newline-separated "dependent" field of the topic's index
     * document, or returns null if no topic document is found.
     */
    public Set getDependentTopics(String topic, IndexReader reader) throws IOException {
        TermDocs docs = reader.termDocs(new Term("topic", topic));
        while (docs.next()) {
            Document doc = reader.document(docs.doc());
            String type = doc.get("type");
            if (type != null && type.equals("topic")) {
                String d = doc.get("dependent");
                StringTokenizer st = new StringTokenizer(d, "\n");
                Set s = new HashSet();
                while (st.hasMoreTokens()) {
                    s.add(st.nextToken());
                }
                return s;
            }
        }
        return null;
    }

    public void updateTopics(Set topics, Set topicsNoDependent, String index, TopicMap tm)
            throws IOException, TopicMapException {
        Iterator iter = topics.iterator();
        Set delete = new HashSet(); // all deleted topics; these are remade after deletion
        // First collect every topic that must be deleted, before actually deleting
        // anything; otherwise the dependent data for some topics may already be gone.
        IndexReader reader = IndexReader.open(index);
        while (iter.hasNext()) {
            String topic = (String) iter.next();
            delete.add(topic);
            Set dep = getDependentTopics(topic, reader);
            if (dep != null) {
                Iterator iter2 = dep.iterator();
                while (iter2.hasNext()) {
                    String topic2 = (String) iter2.next();
                    delete.add(topic2);
                }
            }
        }
        iter = topicsNoDependent.iterator();
        while (iter.hasNext()) {
            String topic = (String) iter.next();
            delete.add(topic);
        }

        // next delete topics
        iter = delete.iterator();
        while (iter.hasNext()) {
            String topic = (String) iter.next();
            reader.deleteDocuments(new Term("topic", topic));
        }
        reader.close();
        // now remake everything we deleted
        Set processed = new HashSet(); // collect updated topic ids here so that we don't update the same topic twice
        IndexWriter writer = new IndexWriter(new File(index), getWriterAnalyzer(), false);
        iter = delete.iterator();
        while (iter.hasNext()) {
            String topic = (String) iter.next();
            Topic t = tm.getTopic(topic);
            if (t == null)
                continue;
            if (!processed.contains(t)) {
                //                logger.writelog("DBG","Updating topic '"+topic+"'");
                processed.add(t);
                processTopic(t, writer);
            }
        }
        writer.optimize();
        writer.close();
    }
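
    /*
     * Note (added): updateTopics is a delete-then-reindex cycle. If topic A's
     * document lists topic B in its "dependent" field, updating A removes and
     * rebuilds the documents of both A and B, keeping cross-topic data in sync.
     */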

    /** Joins the subject identifiers with newlines and builds the Lucene document. */
    public static Document buildDocument(Set<String> sisSet, String type, String name, String keywords, String text,
            String url, String dependent) {
        StringBuilder sis = new StringBuilder();
        for (String si : sisSet) {
            sis.append(si);
            sis.append("\n");
        }
        return buildDocument(sis.toString(), type, name, keywords, text, url, dependent);
    }

    public static Document buildDocument(String sis, String type, String name, String keywords, String text,
            String url, String dependent) {
        Document doc = new Document();

        doc.add(new Field("topic", sis, Field.Store.YES, Field.Index.TOKENIZED)); // special tokenization for subject indicators
        doc.add(new Field("type", type, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("keyword", keywords, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("text", text, Field.Store.NO, Field.Index.TOKENIZED));
        doc.add(new Field("url", url, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("dependent", dependent, Field.Store.YES, Field.Index.NO));

        /*
         * LUCENE 1.3
         * 
        // Field(fieldName, value, store, index, tokenize)
         * 
        doc.add(new Field("topic",sis,true,true,true)); // special tokenization for subject indicators
        doc.add(new Field("type",type,true,true,false));
        doc.add(new Field("name",name,true,true,true));
        doc.add(new Field("keyword",keywords,true,true,true));
        doc.add(new Field("text",text,false,true,true));
        doc.add(new Field("url",url,true,true,false));
        doc.add(new Field("dependent",dependent,true,false,false));
         * 
         */

        // logger.writelog("INF","Adding document to index {"+sis+"} {"+type+"} {"+name+"} {"+keywords+"} {"+text+"} {"+url+"} {"+dependent+"}");
        return doc;
    }
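
    /*
     * Illustration (hypothetical values, not part of the original source):
     *
     *   Document doc = buildDocument(
     *       "http://wandora.org/si/example\n",   // newline-separated subject identifiers
     *       "topic",                             // document type
     *       "Example Topic",                     // name
     *       "Example, Topic",                    // keywords, e.g. from getKeyNames
     *       "Full text is indexed, not stored.", // text
     *       "http://example.org/example.html",   // url
     *       "");                                 // newline-separated dependent topics
     */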

    /** Tokenizer that splits only on newlines; used for the "topic" field so each subject identifier becomes one token. */
    public static class NewlineTokenizer extends CharTokenizer {
        public NewlineTokenizer(Reader in) {
            super(in);
        }

        protected boolean isTokenChar(char c) {
            return c != '\n';
        }
    }

    public static class NewlineAnalyzer extends Analyzer {
        public TokenStream tokenStream(String fieldName, Reader reader) {
            return new NewlineTokenizer(reader);
        }
    }

    // -------------------------------------------------------------------------

    /** Logging helpers: prefer the configured loggers and fall back to standard output only when none is set. */
    public void log(String str) {
        if (logger != null)
            logger.log(str);
        if (toolLogger != null)
            toolLogger.log(str);
        if (logger == null && toolLogger == null) {
            System.out.println(str);
        }
    }

    public void log(Exception e) {
        if (logger != null)
            logger.writelog("ERR", e);
        if (toolLogger != null)
            toolLogger.log(e);
        if (logger == null && toolLogger == null)
            e.printStackTrace();
    }

    public void log(String str, Exception e) {
        if (logger != null)
            logger.writelog("ERR", str, e);
        if (toolLogger != null)
            toolLogger.log(str, e);
        if (logger == null && toolLogger == null) {
            System.out.println(str);
            e.printStackTrace();
        }
    }

    public void log(String t, String str, Exception e) {
        if (logger != null)
            logger.writelog(t, str, e);
        if (toolLogger != null)
            toolLogger.log(str, e);
        if (logger == null && toolLogger == null) {
            System.out.println(str);
            e.printStackTrace();
        }
    }

}
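
Usage

The class above is abstract, so a concrete subclass must supply processTopic. The following sketch is a minimal, hypothetical example, not part of the Wandora sources: it indexes each topic's base name and subject identifiers with the same pre-2.9 Lucene API as the listing, and assumes an existing TopicMap instance and a writable index directory.

package org.wandora.indexer;

import java.io.IOException;
import org.apache.lucene.index.IndexWriter;
import org.wandora.topicmap.Topic;
import org.wandora.topicmap.TopicMap;
import org.wandora.topicmap.TopicMapException;

// Hypothetical subclass: builds one Lucene document per topic.
public class SimpleIndexBuilder extends AbstractIndexBuilder {

    public void processTopic(Topic topic, IndexWriter writer)
            throws IOException, TopicMapException {
        String name = topic.getBaseName();
        if (name == null) name = "";
        writer.addDocument(buildDocument(
                getTopicSubjectIndicators(topic), // joined with newlines by buildDocument
                "topic",                          // document type
                name,                             // stored name field
                getKeyNames(topic),               // keywords from base name and variants
                name,                             // full text to index (here just the name)
                "",                               // no url in this sketch
                ""));                             // no dependent topics tracked
    }
}

Building a fresh index then reduces to new SimpleIndexBuilder().processTopicMap(tm, indexPath), where tm is an existing TopicMap and indexPath names the directory in which the index is created.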