org.apache.lucene.analysis.synonym.SolrSynonymParser.java Source code

Introduction

Here is the source code for org.apache.lucene.analysis.synonym.SolrSynonymParser.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.synonym;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

/**
 * Parser for the Solr synonyms format.
 * <ol>
 *   <li> Empty lines and lines starting with '#' are ignored.
 *   <li> Explicit mappings match any token sequence on the LHS of "=&gt;"
 *        and replace with all alternatives on the RHS.  These types of mappings
 *        ignore the expand parameter in the constructor.
 *        Example:
 *        <blockquote>i-pod, i pod =&gt; ipod</blockquote>
 *   <li> Equivalent synonyms may be separated with commas and give
 *        no explicit mapping.  In this case the mapping behavior will
 *        be taken from the expand parameter in the constructor.  This allows
 *        the same synonym file to be used in different synonym handling strategies.
 *        Example:
 *        <blockquote>ipod, i-pod, i pod</blockquote>
 *
 *   <li> Multiple synonym mapping entries are merged.
 *        Example:
 *        <blockquote>
 *         foo =&gt; foo bar<br>
 *         foo =&gt; baz<br><br>
 *         is equivalent to<br><br>
 *         foo =&gt; foo bar, baz
 *        </blockquote>
 *   <li> A backslash escapes the character that follows it, so an escaped
 *        comma or "=&gt;" is kept as literal text instead of acting as a
 *        separator.  The backslash itself is removed before the term is analyzed.
 *  </ol>
 * @lucene.experimental
 */
public class SolrSynonymParser extends SynonymMap.Parser {
    /** Separator between the two sides of an explicit mapping. */
    private static final String MAPPING_SEPARATOR = "=>";

    /** Separator between the alternatives on one side of a rule. */
    private static final String TERM_SEPARATOR = ",";

    /** Whether equivalent-synonym lines (no "=&gt;") map every term to every other term. */
    private final boolean expand;

    /**
     * Creates a parser for the Solr synonyms format.
     *
     * @param dedup    forwarded unchanged to the {@code SynonymMap.Parser} constructor
     * @param expand   for lines without an explicit "=&gt;" mapping: if {@code true}, every term
     *                 is mapped to every other term on the line; if {@code false}, every term
     *                 (including the first) is mapped to the first term only
     * @param analyzer forwarded unchanged to the {@code SynonymMap.Parser} constructor
     */
    public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
        super(dedup, analyzer);
        this.expand = expand;
    }

    /**
     * Reads synonym rules line by line from {@code in} and adds them to this builder.
     * The reader is always closed before this method returns or throws.
     *
     * @param in source of the synonym rules
     * @throws IOException    if reading from or closing {@code in} fails
     * @throws ParseException if processing a rule raises an {@link IllegalArgumentException};
     *                        the message carries the current line number and the original
     *                        exception is attached as the cause
     */
    @Override
    public void parse(Reader in) throws IOException, ParseException {
        // try-with-resources: a failure in close() is recorded as a suppressed exception
        // instead of replacing the exception that describes the real problem.
        try (LineNumberReader br = new LineNumberReader(in)) {
            // The inner try keeps 'br' in scope for the catch clause so the line number
            // can still be read when building the ParseException.
            try {
                addInternal(br);
            } catch (IllegalArgumentException e) {
                ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
                ex.initCause(e);
                throw ex;
            }
        }
    }

    /**
     * Processes every line of {@code in} as one synonym rule.
     *
     * @param in line source; not closed by this method
     * @throws IOException              if reading a line or analyzing a term fails
     * @throws IllegalArgumentException if a line contains more than one "=&gt;" mapping
     */
    private void addInternal(BufferedReader in) throws IOException {
        String line;
        while ((line = in.readLine()) != null) {
            // The line is not trimmed first: only a zero-length line or a '#' in the
            // very first column is skipped.
            if (line.length() == 0 || line.charAt(0) == '#') {
                continue; // ignore empty lines and comments
            }

            // TODO: we could process this more efficiently.
            String[] sides = split(line, MAPPING_SEPARATOR);
            if (sides.length > 1) { // explicit mapping
                if (sides.length != 2) {
                    throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
                }
                CharsRef[] inputs = analyzeTerms(sides[0]);
                CharsRef[] outputs = analyzeTerms(sides[1]);
                // these mappings are explicit and never preserve original
                for (CharsRef input : inputs) {
                    for (CharsRef output : outputs) {
                        add(input, output, false);
                    }
                }
            } else {
                // No usable "=>" split: the whole original line is treated as a list of
                // equivalent terms.
                CharsRef[] inputs = analyzeTerms(line);
                if (expand) {
                    // all pairs (compared by position, so a term repeated on the line
                    // is still paired with its duplicate)
                    for (int i = 0; i < inputs.length; i++) {
                        for (int j = 0; j < inputs.length; j++) {
                            if (i != j) {
                                add(inputs[i], inputs[j], true);
                            }
                        }
                    }
                } else {
                    // all subsequent inputs map to first one; we also add inputs[0] here
                    // so that we "effectively" (because we remove the original input and
                    // add back a synonym with the same text) change that token's type to
                    // SYNONYM (matching legacy behavior):
                    for (CharsRef input : inputs) {
                        add(input, inputs[0], false);
                    }
                }
            }
        }
    }

    /**
     * Splits a comma-separated list of terms, then unescapes, trims and analyzes each term.
     *
     * @param terms comma-separated terms, possibly containing backslash escapes
     * @return the analyzed terms, in the order they appear in {@code terms}
     * @throws IOException if analysis of a term fails
     */
    private CharsRef[] analyzeTerms(String terms) throws IOException {
        String[] raw = split(terms, TERM_SEPARATOR);
        CharsRef[] analyzed = new CharsRef[raw.length];
        for (int i = 0; i < raw.length; i++) {
            analyzed[i] = analyze(unescape(raw[i]).trim(), new CharsRefBuilder());
        }
        return analyzed;
    }

    /**
     * Splits {@code s} on every unescaped occurrence of {@code separator}.
     * A character preceded by a backslash is never treated as the start of a separator;
     * the backslash is kept in the returned segment. Empty segments are dropped, so the
     * result may have fewer entries than the number of separators plus one.
     *
     * @param s         the text to split
     * @param separator the non-empty separator string
     * @return the non-empty segments, in order
     */
    private static String[] split(String s, String separator) {
        ArrayList<String> parts = new ArrayList<>(2);
        StringBuilder current = new StringBuilder();
        int pos = 0;
        final int end = s.length();
        while (pos < end) {
            if (s.startsWith(separator, pos)) {
                if (current.length() > 0) {
                    parts.add(current.toString());
                    current.setLength(0);
                }
                pos += separator.length();
                continue;
            }

            char ch = s.charAt(pos++);
            if (ch == '\\') {
                // Keep the backslash and copy the next character verbatim, so it cannot
                // begin a separator match.
                current.append(ch);
                if (pos >= end) {
                    break; // trailing backslash with nothing to escape: keep it as-is
                }
                ch = s.charAt(pos++);
            }

            current.append(ch);
        }

        if (current.length() > 0) {
            parts.add(current.toString());
        }

        return parts.toArray(new String[0]);
    }

    /**
     * Removes escaping backslashes from {@code s}: each backslash that is followed by
     * another character is dropped and that following character is kept literally.
     * A backslash that is the last character of the string is kept.
     *
     * @param s the text to unescape
     * @return {@code s} itself if it contains no backslash, otherwise the unescaped copy
     */
    private static String unescape(String s) {
        if (s.indexOf('\\') < 0) {
            return s;
        }
        final int length = s.length();
        StringBuilder sb = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            char ch = s.charAt(i);
            if (ch == '\\' && i < length - 1) {
                sb.append(s.charAt(++i));
            } else {
                sb.append(ch);
            }
        }
        return sb.toString();
    }
}