org.cee.store.lucene.LuceneArticleStore.java Source code

Java tutorial

Introduction

Here is the source code for org.cee.store.lucene.LuceneArticleStore.java

Source

package org.cee.store.lucene;

/*
 * #%L
 * Content Extraction Engine - News Store Lucene
 * %%
 * Copyright (C) 2013 Andreas Behnke
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.cee.search.ArticleSearchService;
import org.cee.search.SearchException;
import org.cee.store.EntityKey;
import org.cee.store.StoreException;
import org.cee.store.article.Article;
import org.cee.store.article.ArticleChangeListener;
import org.cee.store.article.ArticleChangeListenerSupport;
import org.cee.store.article.ArticleKey;
import org.cee.store.article.ArticleStore;
import org.cee.store.article.TextBlock;
import org.cee.store.workingset.WorkingSet;

/**
 * Lucene backed implementation of {@link ArticleStore} and {@link ArticleSearchService}.
 * <p>
 * Every article is kept as one Lucene document which is addressed by the pair
 * "external article id" / "site key". Searcher handling, document insertion and
 * deletion as well as commits are delegated to methods inherited from
 * {@link LuceneStoreBase}.
 */
public class LuceneArticleStore extends LuceneStoreBase implements ArticleStore, ArticleSearchService {

    /** Callback deciding for each search hit whether it becomes part of a key list. */
    private interface ArticleKeyFilter {

        /**
         * @param key external article id read from the hit's document
         * @param scoreDoc the Lucene hit, giving access to its score
         * @return {@code true} to keep the hit, {@code false} to drop it
         */
        boolean accept(String key, ScoreDoc scoreDoc);

    }

    /**
     * Filter used for the related article search. It drops the reference article
     * itself and cuts off weak hits relative to the first remaining hit.
     * <p>
     * The filter is stateful and relies on hits being passed in the order the
     * search returned them; it must be used for a single search only.
     */
    private static final class RelatedArticleFilter implements ArticleKeyFilter {

        private final String referenceKey;

        private boolean isFirst = true;

        private boolean rejectAll = false;

        private float minScore;

        RelatedArticleFilter(String referenceKey) {
            this.referenceKey = referenceKey;
        }

        @Override
        public boolean accept(String key, ScoreDoc scoreDoc) {
            if (rejectAll) {
                return false;
            }
            // the reference article always matches itself, never report it as related
            if (referenceKey.equals(key)) {
                return false;
            }
            if (isFirst) {
                isFirst = false;
                if (scoreDoc.score < MIN_TOP_RELATED_SCORE) {
                    // even the first candidate is too weak: report nothing at all
                    rejectAll = true;
                    return false;
                }
                // all following hits are measured against the first accepted hit
                minScore = scoreDoc.score / RELATED_SCORE_RATIO;
                return true;
            }
            return !(scoreDoc.score < minScore);
        }
    }

    /** Score the first related hit must reach; below it no related articles are reported. */
    private static final float MIN_TOP_RELATED_SCORE = 0.36f;

    /** Following related hits must score at least the first hit's score divided by this value. */
    private static final float RELATED_SCORE_RATIO = 3.3f;

    private final ArticleChangeListenerSupport listenerSupport = new ArticleChangeListenerSupport();

    /**
     * Builds the query identifying exactly one article: both the external id
     * and the site field have to match the given key.
     */
    private Query createArticleQuery(ArticleKey articleKey) {
        BooleanQuery query = new BooleanQuery();
        query.add(new BooleanClause(
                new TermQuery(new Term(LuceneConstants.FIELD_ARTICLE_EXTERNAL_ID, articleKey.getKey())),
                Occur.MUST));
        query.add(new BooleanClause(
                new TermQuery(new Term(LuceneConstants.FIELD_ARTICLE_SITE, articleKey.getSiteKey())), Occur.MUST));
        return query;
    }

    /** Builds a term query on the article language field. */
    private Query createLanguageQuery(String language) {
        return new TermQuery(new Term(LuceneConstants.FIELD_ARTICLE_LANGUAGE, language));
    }

    /**
     * Builds a query matching all articles belonging to any of the given sites.
     * A single site results in a plain term query, otherwise the sites are
     * combined as optional clauses of which at least one has to match.
     */
    private Query createQueryArticlesOfSites(List<EntityKey> sites) {
        if (sites.size() == 1) {
            return new TermQuery(new Term(LuceneConstants.FIELD_ARTICLE_SITE, sites.get(0).getKey()));
        }
        BooleanQuery query = new BooleanQuery();
        query.setMinimumNumberShouldMatch(1);
        for (EntityKey site : sites) {
            query.add(new TermQuery(new Term(LuceneConstants.FIELD_ARTICLE_SITE, site.getKey())),
                    BooleanClause.Occur.SHOULD);
        }
        return query;
    }

    /**
     * Builds the full text search query: the user input is escaped, parsed
     * against the boosted full text fields using the analyzer of the given
     * language, and restricted to the given sites and language.
     *
     * @throws org.apache.lucene.queryparser.classic.ParseException if the escaped input can not be parsed
     */
    private Query createFindArticlesQuery(List<EntityKey> sites, String fulltextSearchQuery, String language)
            throws org.apache.lucene.queryparser.classic.ParseException {
        Analyzer analyzer = getAnalyzer(language);

        MultiFieldQueryParser parser = new MultiFieldQueryParser(LuceneConstants.VERSION,
                LuceneConstants.ARTICLE_FULLTEXT_SEARCH_FIELDS, analyzer,
                LuceneConstants.ARTICLE_FULLTEXT_SEARCH_BOOSTS);
        // escaping turns query syntax characters of the user input into literals
        Query fulltextQuery = parser.parse(QueryParser.escape(fulltextSearchQuery));

        BooleanQuery query = new BooleanQuery();
        query.add(createQueryArticlesOfSites(sites), Occur.MUST);
        query.add(createLanguageQuery(language), Occur.MUST);
        query.add(fulltextQuery, Occur.MUST);
        return query;
    }

    /**
     * Multiplies the boost of every term clause of the given query with the
     * boost configured for the term's field, if there is one. Queries or
     * clauses of an unexpected type are left untouched.
     *
     * @return the given query instance, modified in place
     */
    private Query boostRelatedQuery(Query relatedQuery) {
        if (!(relatedQuery instanceof BooleanQuery)) {
            return relatedQuery;
        }
        for (BooleanClause clause : ((BooleanQuery) relatedQuery).clauses()) {
            Query clauseQuery = clause.getQuery();
            if (!(clauseQuery instanceof TermQuery)) {
                continue;
            }
            TermQuery termQuery = (TermQuery) clauseQuery;
            Float fieldBoost = LuceneConstants.ARTICLE_FULLTEXT_SEARCH_BOOSTS.get(termQuery.getTerm().field());
            if (fieldBoost != null) {
                termQuery.setBoost(fieldBoost * termQuery.getBoost());
            }
        }
        return relatedQuery;
    }

    /**
     * Builds a "more like this" query for the reference article, restricted to
     * the given sites. If the reference article is not part of the index an
     * empty boolean query is returned.
     * <p>
     * NOTE(review): the {@code language} parameter is currently not used by this method.
     */
    private Query createRelatedArticlesQuery(List<EntityKey> sites, ArticleKey reference, IndexSearcher searcher,
            String language) throws IOException {
        TopDocs referenceHits = searcher.search(createArticleQuery(reference), 1);
        if (referenceHits.totalHits == 0) {
            return new BooleanQuery(true);
        }
        MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
        mlt.setFieldNames(LuceneConstants.ARTICLE_RELATED_SEARCH_FIELDS);
        mlt.setMaxQueryTerms(20);
        mlt.setBoost(true);
        mlt.setMinTermFreq(0);
        mlt.setMinDocFreq(0);
        Query relatedQuery = boostRelatedQuery(mlt.like(referenceHits.scoreDocs[0].doc));

        BooleanQuery query = new BooleanQuery();
        query.add(new BooleanClause(relatedQuery, Occur.MUST));
        query.add(new BooleanClause(createQueryArticlesOfSites(sites), Occur.MUST));
        return query;
    }

    /**
     * Executes the query with the given searcher and converts the hits into article keys.
     *
     * @param sort sort order of the hits, or {@code null} to use the searcher's default order
     * @param filter decides which hits are kept, or {@code null} to keep all hits
     * @return keys of at most {@link LuceneConstants#MAX_RESULT_SIZE} hits, never {@code null}
     */
    private List<ArticleKey> getKeys(Query query, IndexSearcher searcher, Sort sort, ArticleKeyFilter filter)
            throws IOException {
        TopDocs topDocs = (sort != null)
                ? searcher.search(query, LuceneConstants.MAX_RESULT_SIZE, sort)
                : searcher.search(query, LuceneConstants.MAX_RESULT_SIZE);
        List<ArticleKey> entityKeys = new ArrayList<ArticleKey>();
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            String key = doc.get(LuceneConstants.FIELD_ARTICLE_EXTERNAL_ID);
            if (filter == null || filter.accept(key, scoreDoc)) {
                String name = doc.get(LuceneConstants.FIELD_ARTICLE_TITLE);
                String siteName = doc.get(LuceneConstants.FIELD_ARTICLE_SITE);
                entityKeys.add(ArticleKey.get(name, key, siteName, scoreDoc.score));
            }
        }
        return entityKeys;
    }

    /** Executes the query with a searcher acquired (and released) for this call, without filtering. */
    private List<ArticleKey> getKeys(Query query, Sort sort) throws IOException {
        IndexSearcher searcher = aquireSearcher();
        try {
            return getKeys(query, searcher, sort, null);
        } finally {
            releaseSearcher(searcher);
        }
    }

    /** Executes the query with the given searcher in default order, applying the filter. */
    private List<ArticleKey> getKeys(Query query, IndexSearcher searcher, ArticleKeyFilter filter)
            throws IOException {
        return getKeys(query, searcher, null, filter);
    }

    /** Executes the query in default order, without filtering. */
    private List<ArticleKey> getKeys(Query query) throws IOException {
        return getKeys(query, (Sort) null);
    }

    /** Looks up the document of the given article using the given searcher. */
    private Document getArticleDocument(ArticleKey articleKey, IndexSearcher searcher) throws IOException {
        return getSingleDocument(searcher, createArticleQuery(articleKey));
    }

    /** Looks up the document of the given article with a searcher acquired (and released) for this call. */
    private Document getArticleDocument(ArticleKey articleKey) throws IOException {
        IndexSearcher searcher = aquireSearcher();
        try {
            return getArticleDocument(articleKey, searcher);
        } finally {
            releaseSearcher(searcher);
        }
    }

    /**
     * Converts an article into its Lucene document. All fields are stored; every
     * text block of the article content becomes its own value of the content field.
     */
    private Document createArticleDocument(EntityKey site, Article article) {
        DocumentBuilder builder = new DocumentBuilder()
                .addStringField(LuceneConstants.FIELD_ARTICLE_EXTERNAL_ID, article.getExternalId(), Field.Store.YES)
                .addStringField(LuceneConstants.FIELD_ARTICLE_SITE, site.getKey(), Field.Store.YES)
                .addStringField(LuceneConstants.FIELD_ARTICLE_LANGUAGE, article.getLanguage(), Field.Store.YES)
                .addStringField(LuceneConstants.FIELD_ARTICLE_LOCATION, article.getLocation(), Field.Store.YES)
                .addTextFieldWithTermVectors(LuceneConstants.FIELD_ARTICLE_TITLE, article.getTitle(),
                        Field.Store.YES)
                .addTextFieldWithTermVectors(LuceneConstants.FIELD_ARTICLE_SHORT_TEXT, article.getShortText(),
                        Field.Store.YES)
                .addDateField(LuceneConstants.FIELD_ARTICLE_PUBLISHED_DATE, article.getPublishedDate(),
                        Field.Store.YES);
        for (TextBlock block : article.getContent()) {
            builder.addTextFieldWithTermVectors(LuceneConstants.FIELD_ARTICLE_CONTENT, block.getContent(),
                    Field.Store.YES);
        }
        return builder.getDocument();
    }

    /**
     * Converts a Lucene document back into an article.
     *
     * @param articleDocument the stored document, may be {@code null}
     * @param withContent if {@code true} the content text blocks are restored as well
     * @return the article, or {@code null} if the document is {@code null}
     * @throws ParseException if the stored published date can not be parsed
     */
    private Article createArticleFromDocument(Document articleDocument, boolean withContent) throws ParseException {
        if (articleDocument == null) {
            return null;
        }
        Article article = new Article();
        article.setExternalId(getStringFieldOrNull(articleDocument, LuceneConstants.FIELD_ARTICLE_EXTERNAL_ID));
        article.setLanguage(getStringFieldOrNull(articleDocument, LuceneConstants.FIELD_ARTICLE_LANGUAGE));
        article.setLocation(getStringFieldOrNull(articleDocument, LuceneConstants.FIELD_ARTICLE_LOCATION));
        article.setTitle(getStringFieldOrNull(articleDocument, LuceneConstants.FIELD_ARTICLE_TITLE));
        article.setShortText(getStringFieldOrNull(articleDocument, LuceneConstants.FIELD_ARTICLE_SHORT_TEXT));
        String publishedValue = getStringFieldOrNull(articleDocument, LuceneConstants.FIELD_ARTICLE_PUBLISHED_DATE);
        // a document without published date must not fail with a NullPointerException while parsing
        if (publishedValue != null) {
            Calendar published = Calendar.getInstance();
            published.setTime(DateTools.stringToDate(publishedValue));
            article.setPublishedDate(published);
        }
        if (withContent) {
            List<TextBlock> blocks = new ArrayList<TextBlock>();
            for (IndexableField field : articleDocument.getFields(LuceneConstants.FIELD_ARTICLE_CONTENT)) {
                blocks.add(new TextBlock(field.stringValue()));
            }
            article.setContent(blocks);
        }
        return article;
    }

    /** Creates an unconfigured store; index writer and analyzers have to be set before use. */
    public LuceneArticleStore() {
    }

    /** Creates a store working on the given index writer and analyzers. */
    public LuceneArticleStore(IndexWriter indexWriter, LuceneAnalyzers analyzers) {
        setIndexWriter(indexWriter);
        setAnalyzers(analyzers);
    }

    /**
     * Replaces the stored version of the article (if any) with the given one,
     * commits the change and notifies the registered listeners.
     *
     * @return the key of the written article
     * @throws StoreException if the index can not be written
     */
    @Override
    public ArticleKey update(EntityKey site, Article article) throws StoreException {
        try {
            // The site field of a document holds site.getKey() (see createArticleDocument),
            // so the delete query has to use the very same value to hit the previous version.
            ArticleKey articleKey = ArticleKey.get(article.getTitle(), article.getExternalId(), site.getKey());
            deleteDocuments(createArticleQuery(articleKey));
            addDocument(createArticleDocument(site, article), article.getLanguage());
            commit();
            // we can not determine if we have updated or created a new article. Always fire article changed event.
            listenerSupport.fireArticleChanged(site, article);
            return articleKey;
        } catch (IOException ioe) {
            throw new StoreException(site, ioe);
        }
    }

    /**
     * Tests whether an article with the given external id exists for the site.
     *
     * @throws StoreException if the index can not be read
     */
    @Override
    public boolean contains(EntityKey site, String externalId) throws StoreException {
        ArticleKey articleKey = ArticleKey.get(null, externalId, site.getKey());
        try {
            return containsDocument(createArticleQuery(articleKey));
        } catch (IOException e) {
            throw new StoreException(site, e);
        }
    }

    /**
     * Adds all articles of the list which are not yet part of the index and
     * notifies the listeners for each added article. Changes are committed at
     * the end of the call, even if processing the list failed half way.
     *
     * @return keys of the articles which have actually been added
     * @throws StoreException if the index can not be read or written
     */
    @Override
    public List<ArticleKey> addNewArticles(EntityKey siteKey, List<Article> articles) throws StoreException {
        try {
            IndexSearcher searcher = aquireSearcher();
            List<ArticleKey> articleKeys = new ArrayList<ArticleKey>();
            // The searcher was acquired before the loop and therefore can not see documents
            // added below; remember the ids added by this call to catch duplicates within the list.
            Set<String> addedExternalIds = new HashSet<String>();
            try {
                for (Article article : articles) {
                    String externalId = article.getExternalId();
                    if (addedExternalIds.contains(externalId)) {
                        continue;
                    }
                    ArticleKey articleKey = ArticleKey.get(article.getTitle(), externalId, siteKey.getKey());
                    if (!containsDocument(createArticleQuery(articleKey), searcher)) {
                        addDocument(createArticleDocument(siteKey, article), article.getLanguage());
                        addedExternalIds.add(externalId);
                        articleKeys.add(articleKey);
                        listenerSupport.fireArticleChanged(siteKey, article);
                    }
                }
                return articleKeys;
            } finally {
                // commit must happen even if releasing the searcher fails
                try {
                    releaseSearcher(searcher);
                } finally {
                    commit();
                }
            }
        } catch (IOException ioe) {
            throw new StoreException(siteKey, ioe);
        }
    }

    /** Registers a listener which is notified about added or updated articles. */
    @Override
    public void addArticleChangeListener(ArticleChangeListener listener) {
        listenerSupport.addArticleChangeListener(listener);
    }

    /**
     * Loads a single article.
     *
     * @param withContent if {@code true} the content text blocks are loaded as well
     * @throws StoreException if the index can not be read or the stored date is invalid
     */
    @Override
    public Article getArticle(ArticleKey key, boolean withContent) throws StoreException {
        try {
            return createArticleFromDocument(getArticleDocument(key), withContent);
        } catch (IOException ioe) {
            throw new StoreException(null, ioe);
        } catch (ParseException pe) {
            throw new StoreException(null, pe);
        }
    }

    /**
     * Loads the articles for all given keys using one searcher for the whole
     * call. The result holds exactly one entry per key, in the order of the
     * keys; the entry is whatever the single article lookup yields for that key.
     *
     * @param withContent if {@code true} the content text blocks are loaded as well
     * @throws StoreException if the index can not be read or a stored date is invalid
     */
    @Override
    public List<Article> getArticles(List<ArticleKey> keys, boolean withContent) throws StoreException {
        try {
            List<Article> articles = new ArrayList<Article>();
            IndexSearcher searcher = aquireSearcher();
            try {
                for (ArticleKey articleKey : keys) {
                    // reuse the searcher acquired above instead of acquiring a new one per article
                    Document articleDocument = getArticleDocument(articleKey, searcher);
                    articles.add(createArticleFromDocument(articleDocument, withContent));
                }
                return articles;
            } finally {
                releaseSearcher(searcher);
            }
        } catch (IOException ioe) {
            throw new StoreException(null, ioe);
        } catch (ParseException pe) {
            throw new StoreException(null, pe);
        }
    }

    /**
     * Lists the articles of one site, sorted by {@link LuceneConstants#ARTICLE_PUBLISHED_SORT}.
     *
     * @throws StoreException if the index can not be read
     */
    @Override
    public List<ArticleKey> getArticlesOrderedByDate(EntityKey siteKey) throws StoreException {
        try {
            List<EntityKey> sites = new ArrayList<EntityKey>();
            sites.add(siteKey);
            return getKeys(createQueryArticlesOfSites(sites), LuceneConstants.ARTICLE_PUBLISHED_SORT);
        } catch (IOException ioe) {
            throw new StoreException(siteKey, ioe);
        }
    }

    /**
     * Lists the articles of the given sites, sorted by {@link LuceneConstants#ARTICLE_PUBLISHED_SORT}.
     *
     * @throws StoreException if the index can not be read
     */
    @Override
    public List<ArticleKey> getArticlesOrderedByDate(List<EntityKey> siteKeys) throws StoreException {
        try {
            return getKeys(createQueryArticlesOfSites(siteKeys), LuceneConstants.ARTICLE_PUBLISHED_SORT);
        } catch (IOException ioe) {
            throw new StoreException(null, ioe);
        }
    }

    /**
     * Lists the articles of all sites of the working set, sorted by
     * {@link LuceneConstants#ARTICLE_PUBLISHED_SORT}.
     *
     * @throws StoreException if the index can not be read
     */
    @Override
    public List<ArticleKey> getArticlesOrderedByDate(WorkingSet workingSet) throws StoreException {
        try {
            return getKeys(createQueryArticlesOfSites(workingSet.getSites()),
                    LuceneConstants.ARTICLE_PUBLISHED_SORT);
        } catch (IOException ioe) {
            throw new StoreException(null, ioe);
        }
    }

    /** Returns the languages supported by the configured analyzers. */
    @Override
    public List<String> getSupportedLanguages() {
        return getAnalyzers().getSupportedLanguages();
    }

    /**
     * Full text search within the articles of the given sites and language.
     *
     * @param fulltextSearchQuery the user input; {@code null} or blank input yields an empty result
     * @return keys of the matching articles, never {@code null}
     * @throws SearchException if the query can not be parsed or the index can not be read
     */
    @Override
    public List<ArticleKey> findArticles(List<EntityKey> sites, String fulltextSearchQuery, String language)
            throws SearchException {
        if (fulltextSearchQuery == null || fulltextSearchQuery.trim().length() == 0) {
            return new ArrayList<ArticleKey>();
        }
        try {
            return getKeys(createFindArticlesQuery(sites, fulltextSearchQuery, language));
        } catch (IOException ioe) {
            throw new SearchException("Could not search for \"" + fulltextSearchQuery + "\"", ioe);
        } catch (org.apache.lucene.queryparser.classic.ParseException pe) {
            throw new SearchException("Could not parse query \"" + fulltextSearchQuery + "\"", pe);
        }
    }

    /**
     * Searches articles of the given sites which are similar to the reference
     * article. The reference article itself is never part of the result. If the
     * first candidate scores below {@code MIN_TOP_RELATED_SCORE} nothing is
     * returned; otherwise hits scoring below the first candidate's score divided
     * by {@code RELATED_SCORE_RATIO} are dropped.
     *
     * @throws SearchException if the index can not be read
     */
    @Override
    public List<ArticleKey> findRelatedArticles(List<EntityKey> sites, ArticleKey articleKey, String language)
            throws SearchException {
        try {
            IndexSearcher searcher = aquireSearcher();
            try {
                Query query = createRelatedArticlesQuery(sites, articleKey, searcher, language);
                return getKeys(query, searcher, new RelatedArticleFilter(articleKey.getKey()));
            } finally {
                releaseSearcher(searcher);
            }
        } catch (IOException ioe) {
            throw new SearchException("Could not find related article", ioe);
        }
    }
}