Example usage for org.apache.lucene.analysis.util TokenizerFactory forName

List of usage examples for org.apache.lucene.analysis.util TokenizerFactory forName

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.util TokenizerFactory forName.

Prototype

public static TokenizerFactory forName(String name, Map<String, String> args) 

Source Link

Document

Looks up a tokenizer factory by name from the context classpath.

Usage

From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java

License:Apache License

/**
 * Parses analyzer definitions from the plugin init args under {@code argName} and
 * registers one {@link TokenizerChain} per named analyzer into {@code analyzers}.
 *
 * <p>Expected config shape: a NamedList of analyzers, each itself a NamedList with a
 * "tokenizer" entry and one or more "filter" entries; each entry is a NamedList of
 * params containing at least a "class" key (SPI keyword or fully-qualified name).
 *
 * @param analyzers destination map, keyed by analyzer name
 * @param argName   name of the init arg holding the analyzer configuration
 * @throws SolrException if an analyzer lacks a tokenizer or has no filters, or if
 *                       informing a ResourceLoaderAware component fails with IOException
 */
private void parseConfig(Map<String, Analyzer> analyzers, String argName) {
    try {
        Object xmlAnalyzers = args.get(argName);

        // instanceof is false for null, so no separate null check is needed.
        if (xmlAnalyzers instanceof NamedList) {
            NamedList<?> analyzersList = (NamedList<?>) xmlAnalyzers;
            for (Entry<String, ?> entry : analyzersList) {
                String analyzerName = entry.getKey();
                if (!(entry.getValue() instanceof NamedList)) {
                    continue;
                }
                NamedList<?> analyzerAsNamedList = (NamedList<?>) entry.getValue();

                TokenizerFactory tokenizerFactory = null;
                TokenFilterFactory filterFactory;
                List<TokenFilterFactory> filterFactories = new LinkedList<>();

                for (Entry<String, ?> analyzerEntry : analyzerAsNamedList) {
                    String key = analyzerEntry.getKey();
                    // BUGFIX: previously tested entry.getValue() (the OUTER entry, already
                    // known to be a NamedList), so a malformed inner entry slipped through
                    // and caused a ClassCastException on the cast below.
                    if (!(analyzerEntry.getValue() instanceof NamedList)) {
                        continue;
                    }
                    Map<String, String> params = convertNamedListToMap((NamedList<?>) analyzerEntry.getValue());

                    String className = params.get("class");
                    if (className == null) {
                        continue;
                    }

                    params.put("luceneMatchVersion", luceneMatchVersion.toString());

                    if (key.equals("tokenizer")) {
                        try {
                            // First try the SPI keyword (e.g. "standard").
                            tokenizerFactory = TokenizerFactory.forName(className, params);
                        } catch (IllegalArgumentException iae) {
                            // Only surface the SPI failure when the name looks like a
                            // keyword; a dotted name is expected to fail SPI lookup.
                            if (!className.contains(".")) {
                                iae.printStackTrace();
                            }
                            // Fall back to loading by fully-qualified class name.
                            tokenizerFactory = loader.newInstance(className, TokenizerFactory.class,
                                    new String[] {}, new Class[] { Map.class }, new Object[] { params });
                        }
                        if (tokenizerFactory instanceof ResourceLoaderAware) {
                            ((ResourceLoaderAware) tokenizerFactory).inform(loader);
                        }
                    } else if (key.equals("filter")) {
                        try {
                            // First try the SPI keyword (e.g. "lowercase").
                            filterFactory = TokenFilterFactory.forName(className, params);
                        } catch (IllegalArgumentException iae) {
                            if (!className.contains(".")) {
                                iae.printStackTrace();
                            }
                            // Fall back to loading by fully-qualified class name.
                            filterFactory = loader.newInstance(className, TokenFilterFactory.class,
                                    new String[] {}, new Class[] { Map.class }, new Object[] { params });
                        }
                        if (filterFactory instanceof ResourceLoaderAware) {
                            ((ResourceLoaderAware) filterFactory).inform(loader);
                        }
                        filterFactories.add(filterFactory);
                    }
                }
                if (tokenizerFactory == null) {
                    throw new SolrException(ErrorCode.SERVER_ERROR,
                            "tokenizer must not be null for analyzer: " + analyzerName);
                } else if (filterFactories.isEmpty()) {
                    throw new SolrException(ErrorCode.SERVER_ERROR,
                            "filter factories must be defined for analyzer: " + analyzerName);
                }

                TokenizerChain analyzer = new TokenizerChain(tokenizerFactory,
                        filterFactories.toArray(new TokenFilterFactory[filterFactories.size()]));

                analyzers.put(analyzerName, analyzer);
            }
        }
    } catch (IOException e) {
        throw new SolrException(ErrorCode.SERVER_ERROR, "Failed to create parser. Check your config.", e);
    }
}

From source file:com.grantingersoll.opengrok.analysis.TokenizerGuru.java

License:Open Source License

/**
 * Reads in the given properties file, where keys are tokenizer SPI names and values are
 * space-and-or-comma delimited, and adds each list value as a key, with an instantiated
 * tokenizer for the given SPI name as its value.
 *
 * @param map The map to populate/*from w  w w. j  a va 2 s.c om*/
 * @param propertiesFile The name of the resource to read in as a properties file
 * @param factorySingletons map of tokenizer SPI names to tokenizer factory, to be used to limit
 *                          tokenizer factory instantiation to one per factory class.
 * @return If the map is sorted, returns a regex alternation of all map keys, otherwise null.
 */
private static String populateTokenizerFactoryMap(Map<String, SymbolTokenizerFactory> map,
        String propertiesFile, Map<String, SymbolTokenizerFactory> factorySingletons) throws IOException {
    Properties values = new Properties();
    InputStream stream = TokenizerGuru.class.getResourceAsStream(propertiesFile);
    values.load(stream);
    for (Map.Entry<Object, Object> entry : values.entrySet()) {
        String tokenizerSPIname = (String) entry.getKey();
        String valueList = (String) entry.getValue();
        SymbolTokenizerFactory factory = factorySingletons.get(tokenizerSPIname);
        if (factory == null) {
            factory = (SymbolTokenizerFactory) TokenizerFactory.forName(tokenizerSPIname,
                    Collections.<String, String>emptyMap());
            if (factory instanceof ResourceLoaderAware) {
                ((ResourceLoaderAware) factory).inform(CLASSPATH_RESOURCE_LOADER);
            }
            factorySingletons.put(tokenizerSPIname, factory);
        }
        // Ignore backslash-escaped value list separators
        for (String value : valueList.trim().split("(?<!\\\\)(?:\\s*,\\s*|\\s+)")) {
            value = value.replaceAll("\\A\\s+|(?<!\\\\)\\s+\\z", ""); // trim non-escaped prefix/suffix whitespace
            value = value.replaceAll("\\\\(.)", "$1"); // unescape backslash-escaped chars
            map.put(value, factory);
        }
    }
    // The regex is only needed for prefix matching, in which case the map keys are sorted longest-first
    String regex = null;
    if (map instanceof SortedMap) {
        StringBuilder regexBuilder = new StringBuilder();
        for (String key : map.keySet()) {
            if (regexBuilder.length() > 0) {
                regexBuilder.append("|");
            }
            regexBuilder.append(Pattern.quote(key));
        }
        regex = regexBuilder.toString();
    }
    return regex;
}

From source file:edu.cmu.lti.oaqa.annographix.solr.SolrTokenizerWrapper.java

License:Apache License

/**
 * A tokenizer: shouldn't be shared among threads (i.e., each thread
 * should use its own copy).
 *
 * <p>
 * The tokenizer can be specified using a short or full class name.
 * However, only the full class name can be used to directly initialize
 * the tokenizer. In the case of a short name, we need to use
 * a tokenizer ID corresponding to this short name. For example,
 * the ID for <b>solr.StandardTokenizerFactory</b> is <b>standard</b>.
 * </p>
 * <p>
 * There is no standard API function to obtain this short tokenizer ID
 * from a name specified in a configuration file. Thus, custom code is
 * used, which guesses this short ID based on the short class name
 * specified in the config. If this guessing code fails for any reason
 * (e.g., it may stop working in future Lucene/SOLR versions), one can
 * specify the full class name in the SOLR schema file. However, it
 * seems to be working fine with Standard SOLR tokenizer factories.
 * </p>
 * <p>
 * For instance, instead of <br>
 * &lt;tokenizer class="solr.StandardTokenizerFactory" maxTokenLength="255"/&gt;<br>
 * one can explicitly specify the full class name:<br>
 * &lt;tokenizer
class="org.apache.lucene.analysis.standard.StandardTokenizerFactory"
maxTokenLength="255" /&gt;<br>
 * </p>
 *
 * @param params  an objects specifying a name of the tokenizer class,
 *                as well as tokenizer's parameters.
 *
 * @throws ClassNotFoundException
 * @throws SecurityException
 * @throws NoSuchMethodException
 * @throws InvocationTargetException
 * @throws IllegalArgumentException
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
public SolrTokenizerWrapper(TokenizerParams params)
        throws InstantiationException, IllegalAccessException, IllegalArgumentException,
        InvocationTargetException, NoSuchMethodException, SecurityException, ClassNotFoundException {
    String tokClassName = params.getTokClassName();

    Map<String, String> tokClassArgs = params.getTokClassArgs();

    if (tokClassName.startsWith(SHORT_NAME_PREFIX) && tokClassName.endsWith(SHORT_NAME_SUFFIX)) {
        /*
         * If it is a standard SOLR name, let's try to rewrite the class
         * using the rewriting convention: strip the "solr." prefix and
         * the "TokenizerFactory" suffix, then lower-case the remainder
         * to guess the SPI keyword (e.g. "standard").
         */
        tokClassName = tokClassName.substring(SHORT_NAME_PREFIX.length());
        tokClassName = tokClassName.substring(0, tokClassName.length() - SHORT_NAME_SUFFIX.length());
        tokClassName = tokClassName.toLowerCase();

        // forName already returns TokenizerFactory; the old explicit cast was redundant.
        mTokStreamFactory = TokenizerFactory.forName(tokClassName, tokClassArgs);
    } else {
        // Load by full class name; asSubclass() makes the constructor/newInstance chain
        // statically typed as TokenizerFactory, so no cast is needed here either.
        mTokStreamFactory = Class.forName(tokClassName).asSubclass(TokenizerFactory.class)
                .getConstructor(Map.class).newInstance(tokClassArgs);
    }
}

From source file:org.apache.jackrabbit.oak.plugins.index.lucene.NodeStateAnalyzerFactory.java

License:Apache License

private TokenizerFactory loadTokenizer(NodeState state) {
    // The SPI name of the tokenizer is mandatory on the node.
    String factoryName = checkNotNull(state.getString(LuceneIndexConstants.ANL_NAME));
    // Remaining node properties become the factory's configuration arguments.
    Map<String, String> factoryArgs = convertNodeState(state);
    TokenizerFactory factory = TokenizerFactory.forName(factoryName, factoryArgs);
    init(factory, state);
    return factory;
}

From source file:org.apache.tika.eval.tokens.AnalyzerDeserializer.java

License:Apache License

/**
 * Builds a TokenizerFactory from a JSON map of the form
 * {@code {"factory": <class name>, "params": {...}}}.
 *
 * <p>The class name may use the "oala." shorthand for
 * "org.apache.lucene.analysis."; it is resolved to the matching
 * SPI keyword before instantiation.
 *
 * @param map          the JSON element describing the factory
 * @param analyzerName name of the enclosing analyzer, for error messages
 * @return the configured tokenizer factory
 * @throws IOException if reading params fails
 * @throws IllegalArgumentException if the element is malformed or no SPI
 *         factory matches the given class name
 */
private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
    if (!(map instanceof JsonObject)) {
        throw new IllegalArgumentException("Expecting a map with \"factory\" string and "
                + "\"params\" map in tokenizer factory;" + " not: " + map.toString() + " in " + analyzerName);
    }
    JsonElement factoryEl = ((JsonObject) map).get(FACTORY);
    if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
        throw new IllegalArgumentException(
                "Expecting value for factory in char filter factory builder in:" + analyzerName);
    }
    String factoryName = factoryEl.getAsString();
    if (factoryName.startsWith("oala.")) {
        // Expand the "oala." shorthand with plain string surgery; the previous
        // replaceFirst("oala.", ...) treated the prefix as a regex, where '.'
        // matches any character.
        factoryName = "org.apache.lucene.analysis." + factoryName.substring("oala.".length());
    }

    JsonElement paramsEl = ((JsonObject) map).get(PARAMS);
    Map<String, String> params = mapify(paramsEl);
    // Map the fully-qualified class name back to its SPI keyword.
    String spiName = "";
    for (String s : TokenizerFactory.availableTokenizers()) {
        Class<? extends TokenizerFactory> clazz = TokenizerFactory.lookupClass(s);
        if (clazz.getName().equals(factoryName)) {
            spiName = s;
            break;
        }
    }
    if (spiName.isEmpty()) {
        throw new IllegalArgumentException(
                "A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name" + "'"
                        + factoryName + "' does not exist.");
    }
    try {
        TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
        if (tokenizerFactory instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) tokenizerFactory)
                    .inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
        }

        return tokenizerFactory;
    } catch (IllegalArgumentException e) {
        // Re-wrap with analyzer context; original cause is preserved.
        throw new IllegalArgumentException("While working on " + analyzerName, e);
    }
}

From source file:org.tallison.gramreaper.ingest.schema.AnalyzerDeserializer.java

License:Apache License

/**
 * Builds a TokenizerFactory from a JSON map of the form
 * {@code {"factory": <class name>, "params": {...}}}.
 *
 * <p>The class name may use the "oala." shorthand for
 * "org.apache.lucene.analysis."; it is resolved to the matching
 * SPI keyword before instantiation.
 *
 * @param map          the JSON element describing the factory
 * @param analyzerName name of the enclosing analyzer, for error messages
 * @return the configured tokenizer factory
 * @throws IOException if reading params fails
 * @throws IllegalArgumentException if the element is malformed or no SPI
 *         factory matches the given class name
 */
private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
    if (!(map instanceof JsonObject)) {
        throw new IllegalArgumentException("Expecting a map with \"factory\" string and "
                + "\"params\" map in tokenizer factory;" + " not: " + map.toString() + " in " + analyzerName);
    }
    JsonElement factoryEl = ((JsonObject) map).get(FACTORY);
    if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
        throw new IllegalArgumentException(
                "Expecting value for factory in char filter factory builder in:" + analyzerName);
    }
    String factoryName = factoryEl.getAsString();
    if (factoryName.startsWith("oala.")) {
        // Expand the "oala." shorthand with plain string surgery; the previous
        // replaceFirst("oala.", ...) treated the prefix as a regex, where '.'
        // matches any character.
        factoryName = "org.apache.lucene.analysis." + factoryName.substring("oala.".length());
    }

    JsonElement paramsEl = ((JsonObject) map).get(PARAMS);
    Map<String, String> params = mapify(paramsEl);
    // Map the fully-qualified class name back to its SPI keyword.
    String spiName = "";
    for (String s : TokenizerFactory.availableTokenizers()) {
        Class<? extends TokenizerFactory> clazz = TokenizerFactory.lookupClass(s);
        if (clazz.getName().equals(factoryName)) {
            spiName = s;
            break;
        }
    }
    // BUGFIX: this guard was missing here (present in the sibling deserializer);
    // without it, TokenizerFactory.forName("") is called on an unknown class name
    // and produces a confusing downstream error instead of naming the bad factory.
    if (spiName.isEmpty()) {
        throw new IllegalArgumentException(
                "A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name '"
                        + factoryName + "' does not exist.");
    }
    try {
        TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
        if (tokenizerFactory instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) tokenizerFactory)
                    .inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
        }

        return tokenizerFactory;
    } catch (IllegalArgumentException e) {
        // Re-wrap with analyzer context; original cause is preserved.
        throw new IllegalArgumentException("While working on " + analyzerName, e);
    }
}