org.exist.indexing.lucene.PlugToLucene.java Source code

Java tutorial

Introduction

Here is the source code for org.exist.indexing.lucene.PlugToLucene.java

Source

/*
 *  eXist Open Source Native XML Database
 *  Copyright (C) 2013 The eXist Project
 *  http://exist-db.org
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 *  $Id$
 */
package org.exist.indexing.lucene;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.BitVector;
import org.apache.lucene.util.Version;
import org.exist.EXistException;
import org.exist.dom.DocumentImpl;
import org.exist.indexing.lucene.PlainTextHighlighter.Offset;
import org.exist.memtree.MemTreeBuilder;
import org.exist.memtree.NodeImpl;
import org.exist.security.Subject;
import org.exist.storage.BrokerPool;
import org.exist.storage.DBBroker;
import org.exist.storage.md.Meta;
import org.exist.storage.md.MetaDataImpl;
import org.exist.storage.md.Metas;
import org.exist.xquery.XPathException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * @author <a href="mailto:shabanovd@gmail.com">Dmitriy Shabanov</a>
 *
 */
/**
 * Bridges the metadata (MD) storage subsystem to the Lucene full-text index:
 * metadata key/value pairs attached to a database document are mirrored into a
 * Lucene {@link Document} (keyed by the document URI in
 * {@link #FIELD_META_DOC_URI}) so they can later be queried with Lucene query
 * syntax via {@link #search(String, List)} and
 * {@link #searchDocuments(String, List)}.
 *
 * <p>NOTE(review): the {@link LuceneIndex} instance and the worker's pending
 * document are obtained via reflection because {@link LuceneIndexWorker}
 * exposes no public accessors for them — this is fragile against upstream
 * field renames.
 *
 * @author <a href="mailto:shabanovd@gmail.com">Dmitriy Shabanov</a>
 */
public class PlugToLucene {

    private static final Logger LOG = Logger.getLogger(PlugToLucene.class.getName());

    /** Lucene field holding the URI of the document a metadata record belongs to. */
    public static final String FIELD_META_DOC_URI = "metaDocUri";

    MetaDataImpl metadata;

    LuceneIndex index;
    LuceneIndexWorker worker;

    /**
     * Creates the plug-in, resolving the Lucene index worker from the broker
     * that is currently active on the calling thread.
     *
     * @param metadata metadata storage used to resolve document UUIDs to documents
     * @throws RuntimeException if no broker is available or the private
     *         {@code index} field of the worker cannot be read via reflection
     */
    public PlugToLucene(MetaDataImpl metadata) {
        this.metadata = metadata;

        DBBroker broker = getBroker();

        worker = (LuceneIndexWorker) broker.getIndexController().getWorkerByIndexId(LuceneIndex.ID);

        try {
            // LuceneIndexWorker does not expose its index; read the private field.
            java.lang.reflect.Field field = worker.getClass().getDeclaredField("index");
            field.setAccessible(true);
            index = (LuceneIndex) field.get(worker);

        } catch (Exception e) {
            throw new RuntimeException("Can't get LuceneIndex", e);
        }
    }

    /**
     * Returns the broker currently active on this thread.
     *
     * @throws RuntimeException if the broker pool is not initialized
     */
    private DBBroker getBroker() {
        BrokerPool db;
        try {
            db = BrokerPool.getInstance();
        } catch (Exception e) {
            throw new RuntimeException("Can't get BrokerPool", e);
        }

        return db.getActiveBroker();
    }

    /**
     * Resolves the database document a metadata record belongs to, temporarily
     * elevating to the system subject so the lookup is not blocked by the
     * caller's permissions.
     *
     * @throws RuntimeException if the document cannot be found
     */
    private DocumentImpl getDocument(Metas metas) {
        //object
        String uuid = metas.getUUID();

        DBBroker broker = getBroker();
        Subject currentSubject = broker.getSubject();
        try {
            broker.setSubject(broker.getDatabase().getSecurityManager().getSystemSubject());
            return metadata.getDocument(uuid);
        } catch (Exception e) {
            throw new RuntimeException("Document '" + uuid + "' not found.", e);
        } finally {
            // Always restore the caller's subject.
            broker.setSubject(currentSubject);
        }
    }

    /**
     * Indexes all (string-valued) metadata of the given record into Lucene.
     *
     * @throws RuntimeException if the index worker is in the middle of
     *         processing a different document
     */
    public void addMetas(Metas metas) {

        //update lucene record

        DocumentImpl doc = getDocument(metas);

        //make sure that index worker do not process different document
        DocumentImpl indexDoc = worker.getDocument();
        if (indexDoc != null && !checkPendingDoc()) {
            throw new RuntimeException("Index processing different document '" + indexDoc.getFileURI() + "' ['"
                    + doc.getFileURI() + "].");
        }

        // Note: code order is important here,
        //worker.setDocument(doc, StreamListener.STORE);
        //worker.setMode(StreamListener.STORE);

        indexMetas(doc, metas);

        //write
        //worker.writeNonXML();
    }

    /**
     * Builds and writes one Lucene document for the metadata record: the eXist
     * doc id, the document URI ({@link #FIELD_META_DOC_URI}), one field per
     * string-valued meta, and an aggregated {@code ALL_METAS} field.
     * Indexing failures are logged and swallowed (best effort).
     */
    private void indexMetas(DocumentImpl doc, Metas metas) {

        // create Lucene document
        Document pendingDoc = new Document();

        // Set DocId
        NumericField fDocId = new NumericField(LuceneIndexWorker.FIELD_DOC_ID, Field.Store.YES, true);
        fDocId.setIntValue(doc.getDocId());
        pendingDoc.add(fDocId);

        // For binary documents the doc path needs to be stored
        String uri = metas.getURI();
        Field fDocUri = new Field(FIELD_META_DOC_URI, uri, Field.Store.YES, Field.Index.NOT_ANALYZED);
        pendingDoc.add(fDocUri);

        // Accumulates every value for the catch-all ALL_METAS field.
        StringBuilder sb = new StringBuilder();

        // Iterate over all found fields and write the data.
        for (Meta meta : metas.metas()) {
            Object value = meta.getValue();
            if (!(value instanceof String)) {
                //ignore non string values
                continue;
            }

            // Field type configuration is not wired up yet; store everything.
            Field.Store store = Field.Store.YES;

            // The meta key becomes the Lucene field name.
            String contentFieldName = meta.getKey();

            // Actual field content; analyzed with term vectors for highlighting.
            Field contentField = new Field(contentFieldName, value.toString(), store, Field.Index.ANALYZED,
                    Field.TermVector.YES);

            pendingDoc.add(contentField);

            sb.append(value.toString()).append(" ");
        }

        // Catch-all field so a query does not need to know individual keys.
        Field contentField = new Field("ALL_METAS", sb.toString(), Field.Store.NO, Field.Index.ANALYZED,
                Field.TermVector.YES);

        pendingDoc.add(contentField);

        IndexWriter writer = null;
        try {
            writer = index.getWriter();

            // by default, Lucene only indexes the first 10,000 terms in a field
            writer.setMaxFieldLength(Integer.MAX_VALUE);

            writer.addDocument(pendingDoc);
        } catch (IOException e) {
            // Best effort: keep going, but do not swallow silently.
            LOG.log(Level.WARNING, "An exception was caught while indexing document: " + e.getMessage(), e);

        } finally {
            if (writer != null) {
                index.releaseWriter(writer);
            }
        }
    }

    /**
     * Removes the Lucene document previously indexed for the given metadata
     * record.
     *
     * @throws RuntimeException if the index worker is in the middle of
     *         processing a different document
     */
    public void removeMetas(Metas metas) {

        DocumentImpl doc = getDocument(metas);

        //make sure that index worker do not process different document
        DocumentImpl indexDoc = worker.getDocument();
        if (indexDoc != null && !checkPendingDoc()) {
            throw new RuntimeException("Index processing different document '" + indexDoc.getFileURI() + "' ['"
                    + doc.getFileURI() + "].");
        }

        // Note: code order is important here,
        //worker.setDocument(doc, StreamListener.STORE);
        //worker.setMode(StreamListener.STORE);

        removeMetas(doc, metas);

        //write
        //worker.writeNonXML();
    }

    /**
     * Deletes all Lucene documents whose {@link #FIELD_META_DOC_URI} matches
     * the record's URI. Failures are logged and swallowed (best effort).
     */
    private void removeMetas(DocumentImpl doc, Metas metas) {

        //update lucene record

        IndexWriter writer = null;
        try {
            writer = index.getWriter();
            String uri = metas.getURI();
            Term dt = new Term(FIELD_META_DOC_URI, uri);
            writer.deleteDocuments(dt);
        } catch (IOException e) {
            // Best effort: keep going, but do not swallow silently.
            LOG.log(Level.WARNING, "Error while removing lucene index: " + e.getMessage(), e);
        } finally {
            if (writer != null) {
                index.releaseWriter(writer);
            }
        }
    }

    /**
     * Returns {@code true} if the worker has no pending (in-flight) document.
     * Reads the worker's private {@code pendingDoc} field via reflection;
     * if reflection fails, conservatively reports {@code false}.
     */
    private boolean checkPendingDoc() {
        try {
            java.lang.reflect.Field field = worker.getClass().getDeclaredField("pendingDoc");
            field.setAccessible(true);
            return (field.get(worker) == null);

        } catch (Exception e) {
            // Reflection failure: assume a document is pending (safe default).
            LOG.log(Level.FINE, "Can't read pendingDoc field", e);
        }

        return false;
    }

    /**
     * Searches metadata and returns a {@code <results>} memtree fragment with
     * one {@code <search>} element (URI, score and highlighted field hits) per
     * matching document. Runs as the system subject.
     *
     * @param queryText        Lucene query string
     * @param toBeMatchedURIs  URI prefixes that a hit must start with
     * @throws XPathException on query-parse, search or database errors
     */
    public NodeImpl search(String queryText, List<String> toBeMatchedURIs) throws XPathException {
        BrokerPool db = null;
        DBBroker broker = null;
        try {
            db = BrokerPool.getInstance();
            broker = db.get(null);

            Subject currentSubject = broker.getSubject();
            try {

                broker.setSubject(db.getSecurityManager().getSystemSubject());

                return search(toBeMatchedURIs, queryText);

            } finally {
                broker.setSubject(currentSubject);
            }

        } catch (EXistException e) {
            throw new XPathException(e);
        } finally {
            if (db != null)
                db.release(broker);
        }
    }

    /**
     * Core search: parses the query, collects hits ordered by descending
     * score, filters them against {@code toBeMatchedURIs} and serializes the
     * surviving hits (with highlighted field content) into a memtree.
     */
    private NodeImpl search(List<String> toBeMatchedURIs, String queryText) throws XPathException {

        NodeImpl report = null;

        IndexSearcher searcher = null;
        try {
            // Get index searcher
            searcher = index.getSearcher();

            // Get analyzer : to be retrieved from configuration
            Analyzer searchAnalyzer = new StandardAnalyzer(Version.LUCENE_29);

            // Setup query Version, default field, analyzer
            QueryParser parser = new QueryParser(Version.LUCENE_29, "", searchAnalyzer);
            Query query = parser.parse(queryText);

            // extract all used fields from query
            String[] fields = LuceneUtil.extractFields(query, searcher.getIndexReader());

            // Setup collector for results
            LuceneHitCollector collector = new LuceneHitCollector();

            // Perform actual search
            searcher.search(query, collector);

            // Retrieve all documents that match the query
            List<ScoreDoc> results = collector.getDocsByScore();

            // reusable attributes
            AttributesImpl attribs = null;

            PlainTextHighlighter highlighter = new PlainTextHighlighter(query, searcher.getIndexReader());

            MemTreeBuilder builder = new MemTreeBuilder();
            builder.startDocument();

            // start root element
            int nodeNr = builder.startElement("", "results", "results", null);

            // Guards against emitting the same Lucene doc twice.
            BitVector processed = new BitVector(searcher.maxDoc());
            // Process result documents
            for (ScoreDoc scoreDoc : results) {
                if (processed.get(scoreDoc.doc))
                    continue;
                processed.set(scoreDoc.doc);

                Document doc = searcher.doc(scoreDoc.doc);

                // Get URI field of document
                String fDocUri = doc.get(FIELD_META_DOC_URI);

                // Get score
                float score = scoreDoc.score;

                // Check if document URI has a full match or if a
                // document is in a collection
                if (isDocumentMatch(fDocUri, toBeMatchedURIs)) {

                    // setup attributes
                    attribs = new AttributesImpl();
                    attribs.addAttribute("", "uri", "uri", "CDATA", fDocUri);
                    attribs.addAttribute("", "score", "score", "CDATA", "" + score);

                    // write element and attributes
                    builder.startElement("", "search", "search", attribs);
                    for (String field : fields) {
                        String[] fieldContent = doc.getValues(field);
                        attribs.clear();
                        attribs.addAttribute("", "name", "name", "CDATA", field);
                        for (String content : fieldContent) {
                            List<Offset> offsets = highlighter.getOffsets(content, searchAnalyzer);
                            if (offsets != null) {
                                builder.startElement("", "field", "field", attribs);
                                highlighter.highlight(content, offsets, builder);
                                builder.endElement();
                            }
                        }
                    }
                    builder.endElement();

                    // clean attributes
                    attribs.clear();
                }
            }

            // finish root element
            builder.endElement();

            // TODO check
            report = ((org.exist.memtree.DocumentImpl) builder.getDocument()).getNode(nodeNr);

        } catch (Exception ex) {
            // Cause is preserved in the wrapping exception.
            throw new XPathException(ex);

        } finally {
            if (searcher != null) {
                index.releaseSearcher(searcher);
            }
        }

        return report;
    }

    /**
     * Searches metadata and returns the URIs of matching documents, ordered by
     * descending score and filtered against {@code toBeMatchedURIs}.
     *
     * @param queryText        Lucene query string
     * @param toBeMatchedURIs  URI prefixes that a hit must start with
     * @return matching document URIs (possibly empty, never {@code null})
     * @throws XPathException on query-parse or search errors
     */
    public List<String> searchDocuments(String queryText, List<String> toBeMatchedURIs) throws XPathException {

        List<String> uris = new ArrayList<String>();

        IndexSearcher searcher = null;
        try {
            // Get index searcher
            searcher = index.getSearcher();

            // Get analyzer : to be retrieved from configuration
            Analyzer searchAnalyzer = new StandardAnalyzer(Version.LUCENE_29);

            // Setup query Version, default field, analyzer
            QueryParser parser = new QueryParser(Version.LUCENE_29, "", searchAnalyzer);
            Query query = parser.parse(queryText);

            // Setup collector for results
            LuceneHitCollector collector = new LuceneHitCollector();

            // Perform actual search
            searcher.search(query, collector);

            // Retrieve all documents that match the query
            List<ScoreDoc> results = collector.getDocsByScore();

            // Guards against emitting the same Lucene doc twice.
            BitVector processed = new BitVector(searcher.maxDoc());
            // Process result documents
            for (ScoreDoc scoreDoc : results) {
                if (processed.get(scoreDoc.doc))
                    continue;
                processed.set(scoreDoc.doc);

                Document doc = searcher.doc(scoreDoc.doc);

                // Get URI field of document
                String fDocUri = doc.get(FIELD_META_DOC_URI);

                // Check if document URI has a full match or if a
                // document is in a collection
                if (isDocumentMatch(fDocUri, toBeMatchedURIs)) {
                    uris.add(fDocUri);
                }
            }

        } catch (Exception ex) {
            // Cause is preserved in the wrapping exception.
            throw new XPathException(ex);

        } finally {
            if (searcher != null) {
                index.releaseSearcher(searcher);
            }
        }

        return uris;
    }

    /**
     * Returns {@code true} if {@code docUri} starts with any of the URIs in
     * {@code toBeMatchedUris} (exact match or collection-prefix match).
     * Either argument being {@code null} yields {@code false}.
     */
    private boolean isDocumentMatch(String docUri, List<String> toBeMatchedUris) {

        if (docUri == null) {
            return false;
        }

        if (toBeMatchedUris == null) {
            return false;
        }

        for (String doc : toBeMatchedUris) {
            if (docUri.startsWith(doc)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Collects every hit with its score; results can be retrieved sorted by
     * doc id ({@link #getDocs()}) or by descending score
     * ({@link #getDocsByScore()}). Not thread-safe.
     */
    private static class LuceneHitCollector extends Collector {

        private List<ScoreDoc> docs = new ArrayList<ScoreDoc>();
        private int docBase;
        private Scorer scorer;

        private LuceneHitCollector() {
            //Nothing special to do
        }

        /** Returns the collected hits sorted by ascending doc id. */
        public List<ScoreDoc> getDocs() {
            Collections.sort(docs, new Comparator<ScoreDoc>() {

                public int compare(ScoreDoc scoreDoc, ScoreDoc scoreDoc1) {
                    if (scoreDoc.doc == scoreDoc1.doc)
                        return 0;
                    else if (scoreDoc.doc < scoreDoc1.doc)
                        return -1;
                    return 1;
                }
            });
            return docs;
        }

        /**
         * Get matching lucene documents by descending score.
         *
         * @return the collected hits, highest score first
         */
        public List<ScoreDoc> getDocsByScore() {
            Collections.sort(docs, new Comparator<ScoreDoc>() {

                public int compare(ScoreDoc scoreDoc, ScoreDoc scoreDoc1) {
                    // Descending order; Float.compare handles all float values safely.
                    return Float.compare(scoreDoc1.score, scoreDoc.score);
                }
            });
            return docs;
        }

        @Override
        public void setScorer(Scorer scorer) throws IOException {
            this.scorer = scorer;
        }

        @Override
        public void setNextReader(IndexReader indexReader, int docBase) throws IOException {
            // Remember the segment offset so collect() can produce global ids.
            this.docBase = docBase;
        }

        @Override
        public boolean acceptsDocsOutOfOrder() {
            return false;
        }

        @Override
        public void collect(int doc) {
            try {
                float score = scorer.score();
                docs.add(new ScoreDoc(doc + docBase, score));
            } catch (IOException e) {
                // Best effort: skip this hit but record the failure.
                LOG.log(Level.WARNING, "Can't score document " + (doc + docBase), e);
            }
        }
    }
}