org.lexevs.dao.index.metadata.BaseMetaDataLoader.java Source code

Introduction

Here is the source code for org.lexevs.dao.index.metadata.BaseMetaDataLoader.java, a LexEVS base class that builds Lucene Document objects for coding scheme metadata properties and supplies the per-field Analyzer used to index and query them.

Source

/*
 * Copyright: (c) 2004-2010 Mayo Foundation for Medical Education and 
 * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
 * triple-shield Mayo logo are trademarks and service marks of MFMER.
 *
 * Except as contained in the copyright notice above, or as used to identify 
 * MFMER as the author of this software, the trade names, trademarks, service
 * marks, or product names of the copyright holder shall not be used in
 * advertising, promotion or otherwise in connection with this software without
 * prior written authorization of the copyright holder.
 * 
 * Licensed under the Eclipse Public License, Version 1.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at 
 * 
 *       http://www.eclipse.org/legal/epl-v10.html
 * 
 */
package org.lexevs.dao.index.metadata;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.LexGrid.LexBIG.Utility.logging.LgLoggerIF;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.util.AttributeFactory;
import org.lexevs.dao.indexer.api.generators.DocumentFromStringsGenerator;
import org.lexevs.logging.LoggerFactory;

/**
 * Base class for building a metadata index for LexEVS metadata.
 * 
 * @author <A HREF="mailto:armbrust.daniel@mayo.edu">Dan Armbrust</A>
 * @author <A HREF="mailto:erdmann.jesse@mayo.edu">Jesse Erdmann</A>
 * @version subversion $Revision: $ checked in on $Date: $
 */
public class BaseMetaDataLoader {

    private DocumentFromStringsGenerator generator_ = new DocumentFromStringsGenerator();
    protected static boolean normEnabled_ = false;
    protected static boolean doubleMetaphoneEnabled_ = true;
    protected static boolean stemmingEnabled_ = true;
    private static final String normPrefix_ = "norm_";
    private static final String doubleMetaphonePrefix_ = "dm_";
    private static final String stemmingPrefix_ = "stem_";

    public static final String STRING_TOKENIZER_TOKEN = "<:>";
    public static final String CONCATINATED_VALUE_SPLIT_TOKEN = ":";

    protected LgLoggerIF getLogger() {
        return LoggerFactory.getLogger();
    }

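    /**
     * Builds a Lucene Document describing a single metadata property of a
     * coding scheme. The document always carries the scheme URI, version,
     * and a combined URI:version key (used to simplify deletions); when
     * supplied, it also carries the parent container path, the property
     * name, and several copies of the property value (tokenized,
     * lower-cased untokenized, and norm/double-metaphone/stemmed variants
     * when those features are enabled). A final "fields" field lists the
     * names of all fields placed in the document.
     */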
    public Document addProperty(String codingSchemeUri, String codingSchemeVersion,
            List<String> propertyValueParents, String propertyName, String propertyValue) throws Exception {
        StringBuffer fields = new StringBuffer();
        generator_.startNewDocument(codingSchemeUri + CONCATINATED_VALUE_SPLIT_TOKEN + codingSchemeVersion
                + CONCATINATED_VALUE_SPLIT_TOKEN + UUID.randomUUID().toString());
        generator_.addTextField("codingSchemeRegisteredName", codingSchemeUri, true, true, false);
        fields.append("codingSchemeRegisteredName ");
        generator_.addTextField("codingSchemeVersion", codingSchemeVersion, true, true, false);
        fields.append("codingSchemeVersion ");

        // this field is used to make deletions easier.
        generator_.addTextField("codingSchemeNameVersion",
                codingSchemeUri + CONCATINATED_VALUE_SPLIT_TOKEN + codingSchemeVersion, false, true, false);
        fields.append("codingSchemeNameVersion ");

        if (propertyValueParents != null && propertyValueParents.size() > 0) {
            StringBuffer temp = new StringBuffer();
            for (int i = 0; i < propertyValueParents.size(); i++) {
                temp.append(propertyValueParents.get(i));
                if (i + 1 < propertyValueParents.size()) {
                    temp.append(STRING_TOKENIZER_TOKEN);
                }
            }
            generator_.addTextField("parentContainers", temp.toString(), true, true, true);
            fields.append("parentContainers ");
        }

        if (propertyName != null && propertyName.length() > 0) {
            generator_.addTextField("propertyName", propertyName, true, true, false);
            fields.append("propertyName ");
        }

        if (propertyValue != null && propertyValue.length() > 0) {
            generator_.addTextField("propertyValue", propertyValue, true, true, true);
            fields.append("propertyValue ");

            // This copy of the content is required for making "startsWith" or
            // "exactMatch" types of queries
            generator_.addTextField("untokenizedLCPropertyValue", propertyValue.toLowerCase(), false, true, false);
            fields.append("untokenizedLCPropertyValue ");

            if (normEnabled_) {
                generator_.addTextField(normPrefix_ + "propertyValue", propertyValue, false, true, true);
                fields.append(normPrefix_ + "propertyValue ");
            }

            if (doubleMetaphoneEnabled_) {
                generator_.addTextField(doubleMetaphonePrefix_ + "propertyValue", propertyValue, false, true, true);
                fields.append(doubleMetaphonePrefix_ + "propertyValue ");
            }

            if (stemmingEnabled_) {
                generator_.addTextField(stemmingPrefix_ + "propertyValue", propertyValue, false, true, true);
                fields.append(stemmingPrefix_ + "propertyValue ");
            }
        }

        generator_.addTextField("fields", fields.toString(), true, true, true);

        return generator_.getDocument();
    }

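    /**
     * Builds the Analyzer used to index and query the documents produced by
     * addProperty. A PerFieldAnalyzerWrapper routes the dm_ copy of the
     * property value through a double-metaphone chain, the stem_ copy
     * through an English Snowball stemming chain, the norm_ copy through a
     * stop-word-free StandardAnalyzer, and "parentContainers" through a
     * StandardAnalyzer whose only stop word is the container divider token;
     * all other fields fall back to a stop-word-free StandardAnalyzer.
     */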
    public static Analyzer getMetadataAnalyzer() {
        Map<String, Analyzer> analyzerPerField = new HashMap<>();

        if (doubleMetaphoneEnabled_) {
            Analyzer temp = new Analyzer() {

                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    final StandardTokenizer source = new StandardTokenizer(
                            AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                    source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                    TokenStream filter = new StandardFilter(source);
                    filter = new LowerCaseFilter(filter);
                    filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
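                    // emit double-metaphone codes (max length 4) alongside the original tokens (inject = true)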
                    filter = new DoubleMetaphoneFilter(filter, 4, true);
                    return new TokenStreamComponents(source, filter);
                }
            };
            analyzerPerField.put(doubleMetaphonePrefix_ + "propertyValue", temp);
        }

        if (normEnabled_) {
            try {
                Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
                analyzerPerField.put(normPrefix_ + "propertyValue", temp);
            } catch (NoClassDefFoundError e) {
                // norm is not available
                normEnabled_ = false;
            }
        }

        if (stemmingEnabled_) {
            Analyzer temp = new Analyzer() {

                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    final StandardTokenizer source = new StandardTokenizer(
                            AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                    source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                    TokenStream filter = new StandardFilter(source);
                    filter = new LowerCaseFilter(filter);
                    filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
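                    // reduce tokens to their English (Snowball) stems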
                    filter = new SnowballFilter(filter, "English");
                    return new TokenStreamComponents(source, filter);
                }
            };
            analyzerPerField.put(stemmingPrefix_ + "propertyValue", temp);
        }

        // these fields just get simple analyzing.
        List<String> dividerList = new ArrayList<String>();
        dividerList.add(STRING_TOKENIZER_TOKEN);
        Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));
        analyzerPerField.put("parentContainers", sa);

        // no stop words, default character removal set.
        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
                analyzerPerField);

        return analyzer;
    }
}
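
The class above only builds documents and analyzers; it does not write an index itself. The sketch below shows one way the two methods might be combined with a plain Lucene IndexWriter. It is a minimal illustration, assuming a Lucene classpath consistent with the imports in the listing; the coding scheme URI, version, and property values are made up, and the in-memory RAMDirectory stands in for wherever LexEVS actually stores its metadata index.

import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.lexevs.dao.index.metadata.BaseMetaDataLoader;

public class MetadataIndexExample {
    public static void main(String[] args) throws Exception {
        // Build one metadata document; URI, version, and values are illustrative only.
        BaseMetaDataLoader loader = new BaseMetaDataLoader();
        Document doc = loader.addProperty(
                "urn:example:codingSchemeUri",           // hypothetical coding scheme URI
                "1.0",                                   // hypothetical version
                Arrays.asList("sourceMetadata", "release"),
                "description",
                "Example release metadata for a coding scheme");

        // Index it with the per-field analyzer so the dm_/stem_/norm_ copies are
        // tokenized the same way they will later be queried.
        Analyzer analyzer = BaseMetaDataLoader.getMetadataAnalyzer();
        try (Directory dir = new RAMDirectory();
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
            writer.addDocument(doc);
            writer.commit();
        }
    }
}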