org.pageseeder.flint.lucene.search.Fields.java Source code

Introduction

Here is the source code for org.pageseeder.flint.lucene.search.Fields.java
Source

/*
 * Copyright 2015 Allette Systems (Australia)
 * http://www.allette.com.au
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.pageseeder.flint.lucene.search;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.DataFormatException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.CompressionTools;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.util.BytesRef;
import org.pageseeder.flint.lucene.util.Beta;
import org.slf4j.LoggerFactory;

/**
 * A set of utility methods for dealing with search fields.
 *
 * @author Christophe Lauret
 * @version 12 August 2010
 */
public final class Fields {

    /** Utility class */
    private Fields() {
    }

    /**
     * Returns a mapping of fields with a default boost value of 1.0.
     *
     * @param fields the list of fields to create the map.
     * @return the corresponding map with each field value mapped to a boost value of 1.0
     */
    @Beta
    public static Map<String, Float> asBoostMap(List<String> fields) {
        Map<String, Float> map = new HashMap<String, Float>();
        for (String f : fields) {
            map.put(f, 1.0f);
        }
        return map;
    }

    /**
     * Indicates whether the given field name is valid.
     *
     * <p>This method does not check for the existence of the field.
     *
     * @param field the name of the field to check.
     * @return <code>true</code> if the field name is a valid name for the index;
     *         <code>false</code> otherwise.
     */
    @Beta
    public static boolean isValidName(String field) {
        return field != null && field.length() > 0;
    }

    /**
     * Returns a list of valid field names.
     *
     * @param fields the list of fields to create the map.
     * @return a list of valid field names.
     */
    @Beta
    public static List<String> filterNames(List<String> fields) {
        List<String> names = new ArrayList<String>();
        for (String f : fields) {
            if (isValidName(f)) {
                names.add(f);
            }
        }
        return names;
    }

    /**
     * Returns a list of possible field values from the specified text.
     *
     * <p>You can use this method to extract the list of terms or phrase values to create a query.
     *
     * <p>Spaces are ignored unless they are within double quotation marks.
     *
     * <p>See examples below:
     * <pre>
     * |Big|             => [Big]
     * |Big bang|        => [Big, bang]
     * |   Big   bang |  => [Big, bang]
     * |The "Big bang"|  => [The, "Big bang"]
     * |The "Big bang|   => [The, "Big, bang]
     * </pre>
     *
     * <p>Note: this class does not excludes terms which could be considered stop words by the index.
     *
     * @param text The text for which values are needed.
     * @return the corresponding list of values.
     */
    @Beta
    public static List<String> toValues(String text) {
        List<String> values = new ArrayList<String>();
        Pattern p = Pattern.compile("(\\\"[^\\\"]+\\\")|(\\S+)");
        Matcher m = p.matcher(text);
        while (m.find()) {
            values.add(m.group());
        }
        return values;
    }

    /**
     * Returns the string value of the specified field.
     *
     * <p>This method will automatically decompress the value of the field if it is binary.
     *
     * @param f The field
     * @return The value of the field as a string.
     */
    public static String toString(IndexableField f) {
        String value = f.stringValue();
        // is it a compressed field?
        if (value == null) {
            BytesRef binary = f.binaryValue();
            if (binary != null)
                try {
                    value = CompressionTools.decompressString(binary);
                } catch (DataFormatException ex) {
                    // strange but true, unable to decompress
                    LoggerFactory.getLogger(Fields.class).error("Failed to decompress field value", ex);
                    return null;
                }
        }
        return value;
    }

    /**
     * Returns the terms for a field
     *
     * @param field    The field
     * @param text     The text to analyze
     * @param analyzer The analyzer
     *
     * @return the corresponding list of terms produced by the analyzer.
     *
     * @throws IOException
     */
    public static List<String> toTerms(String field, String text, Analyzer analyzer) {
        List<String> terms = new ArrayList<String>();
        try {
            TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
            CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String term = attribute.toString();
                terms.add(term);
            }
            stream.end();
            stream.close();
        } catch (IOException ex) {
            // Should not occur since we use a StringReader
            ex.printStackTrace();
        }
        return terms;
    }

}