org.elasticsearch.index.analysis.CustomWordBoundaryStandardTokenizerFactory.java Source code

Introduction

Here is the source code for org.elasticsearch.index.analysis.CustomWordBoundaryStandardTokenizerFactory.java
Source

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.CustomWordBoundaryStandardTokenizer;
import org.apache.lucene.analysis.standard.CustomWordBoundaryStandardTokenizerImpl;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettingsService;
import org.elasticsearch.common.settings.Settings;
import org.apache.lucene.util.Version;

/**
 * Tokenizer factory that configures a {@link CustomWordBoundaryStandardTokenizer} from a list
 * of {@code character => word-boundary-class} rules.
 *
 * <p>The rules are obtained through {@code Analysis.getWordList(environment, settings, "mappings")}.
 * Each rule has the form {@code <char> => <class>} where {@code <char>} is exactly one
 * character (escape sequences such as {@code \\u0020} are supported, see
 * {@link #parseString(String)}) and {@code <class>} is the name of a {@link WBProperty} constant.
 */
@AnalysisSettingsRequired
public class CustomWordBoundaryStandardTokenizerFactory extends AbstractTokenizerFactory {

    /** Matches a {@code source => target} rule; whitespace around the arrow is optional. */
    private static final Pattern RULE_PATTERN = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");

    /** Immutable map from a character to the word-boundary class constant it is assigned to. */
    private final Map<Character, Character> characterMappings;

    /**
     * Reads the mapping rules from the settings and parses them into the character map.
     *
     * @throws IllegalArgumentException if no rule list could be obtained from the settings
     * @throws RuntimeException if any rule is malformed (see {@link #parseRules(List, Map)})
     */
    @Inject
    public CustomWordBoundaryStandardTokenizerFactory(Index index, IndexSettingsService indexSettingsService,
            Environment environment, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettingsService.getSettings(), name, settings);

        List<String> rules = Analysis.getWordList(environment, settings, "mappings");
        if (rules == null) {
            throw new IllegalArgumentException(
                    "mapping requires either `mappings` or `mappings_path` to be configured");
        }

        Map<Character, Character> map = new HashMap<>();
        parseRules(rules, map);
        characterMappings = Collections.unmodifiableMap(map);
    }

    /**
     * Creates the tokenizer.
     *
     * @return a {@link CustomWordBoundaryStandardTokenizer} using the configured mappings when
     *         {@code version} is on or after Lucene 5.5.0, otherwise a plain
     *         {@link StandardTokenizer} (the mappings are ignored in that case)
     */
    @Override
    public Tokenizer create() {
        if (version.onOrAfter(Version.LUCENE_5_5_0)) {
            return new CustomWordBoundaryStandardTokenizer(characterMappings);
        } else {
            return new StandardTokenizer();
        }
    }

    /**
     * Parses a list of MappingCharFilter style rules ({@code source => target}) into the given
     * character map.
     *
     * @param rules the raw rule strings
     * @param map   receives one entry per rule: the source character mapped to the constant
     *              selected by the target {@link WBProperty} name
     * @throws RuntimeException if a rule does not match {@code source => target}, if the source
     *                          is not exactly one character, or if the target is not a
     *                          {@link WBProperty} name
     */
    private static void parseRules(List<String> rules, Map<Character, Character> map) {
        for (String rule : rules) {
            Matcher m = RULE_PATTERN.matcher(rule);
            if (!m.find()) {
                throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
            }
            String lhs = parseString(m.group(1).trim());
            String rhs = parseString(m.group(2).trim());
            // The source must be exactly one char: an empty source previously slipped past the
            // length check and failed later with a bare StringIndexOutOfBoundsException.
            if (lhs.length() != 1) {
                throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal mapping.");
            }

            try {
                map.put(lhs.charAt(0), translateWordBoundary(rhs));
            } catch (IllegalArgumentException iae) {
                // Keep the original exception as the cause so the offending value is not lost.
                throw new RuntimeException(
                        "Invalid Mapping Rule : [" + rule + "]. Unrecognized WordBoundary property value", iae);
            }
        }
    }

    /**
     * Names accepted on the right-hand side of a mapping rule. Each constant selects the
     * correspondingly named {@code WB_CLASS_*} constant of
     * {@link CustomWordBoundaryStandardTokenizerImpl} (see {@code translateWordBoundary}).
     */
    public enum WBProperty {
        L, N, EXNL, MNL, MN, ML, SQ, DQ, BRK
    }

    /**
     * Translates a {@link WBProperty} name into the matching {@code WB_CLASS_*} constant.
     *
     * @param mapping the property name, e.g. {@code "L"} or {@code "BRK"}
     * @return the word-boundary class constant for that property
     * @throws IllegalArgumentException if {@code mapping} is not the name of a {@link WBProperty}
     */
    private static Character translateWordBoundary(String mapping) throws IllegalArgumentException {
        WBProperty property = WBProperty.valueOf(mapping);
        switch (property) {
        case L:
            return CustomWordBoundaryStandardTokenizerImpl.WB_CLASS_LETTER;
        case N:
            return CustomWordBoundaryStandardTokenizerImpl.WB_CLASS_NUMERIC;
        case EXNL:
            return CustomWordBoundaryStandardTokenizerImpl.WB_CLASS_EXTENDED_NUM_LETTER;
        case MNL:
            return CustomWordBoundaryStandardTokenizerImpl.WB_CLASS_MID_NUMBER_LETTER;
        case MN:
            return CustomWordBoundaryStandardTokenizerImpl.WB_CLASS_MID_NUMBER;
        case ML:
            return CustomWordBoundaryStandardTokenizerImpl.WB_CLASS_MID_LETTER;
        case SQ:
            return CustomWordBoundaryStandardTokenizerImpl.WB_CLASS_SINGLE_QUOTE;
        case DQ:
            return CustomWordBoundaryStandardTokenizerImpl.WB_CLASS_DOUBLE_QUOTE;
        case BRK:
            return CustomWordBoundaryStandardTokenizerImpl.WB_CLASS_BREAK;
        default:
            // Not reachable for the constants declared above; guards against future additions.
            throw new IllegalArgumentException();
        }
    }

    /**
     * Resolves backslash escape sequences in one side of a rule. Copied from
     * {@link org.elasticsearch.index.analysis.MappingCharFilterFactory}, but building the result
     * in a local {@link StringBuilder} instead of a shared fixed-size buffer so that input longer
     * than 256 characters no longer overflows.
     *
     * <p>Recognized escapes are {@code \\\\}, {@code \\n}, {@code \\t}, {@code \\r}, {@code \\b},
     * {@code \\f} and {@code \\uXXXX} (four hex digits). For any other escaped character the
     * backslash is dropped and the character itself is kept.
     *
     * @param s the text to unescape
     * @return the unescaped text, never {@code null}
     * @throws RuntimeException if {@code s} ends with a lone backslash or a {@code \\u} escape
     *                          has fewer than four characters after it
     * @throws NumberFormatException if the four characters after {@code \\u} are not a hex number
     */
    private static String parseString(String s) {
        int readPos = 0;
        int len = s.length();
        StringBuilder out = new StringBuilder(len);
        while (readPos < len) {
            char c = s.charAt(readPos++);
            if (c == '\\') {
                if (readPos >= len) {
                    throw new RuntimeException("Invalid escaped char in [" + s + "]");
                }
                c = s.charAt(readPos++);
                switch (c) {
                case '\\':
                    c = '\\';
                    break;
                case 'n':
                    c = '\n';
                    break;
                case 't':
                    c = '\t';
                    break;
                case 'r':
                    c = '\r';
                    break;
                case 'b':
                    c = '\b';
                    break;
                case 'f':
                    c = '\f';
                    break;
                case 'u':
                    // Four hex digits must follow the 'u'.
                    if (readPos + 3 >= len) {
                        throw new RuntimeException("Invalid escaped char in [" + s + "]");
                    }
                    c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16);
                    readPos += 4;
                    break;
                default:
                    // Unknown escape: keep the escaped character as-is.
                    break;
                }
            }
            out.append(c);
        }
        return out.toString();
    }
}