List of usage examples for org.apache.lucene.analysis.util TokenizerFactory forName
public static TokenizerFactory forName(String name, Map<String, String> args)
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
private void parseConfig(Map<String, Analyzer> analyzers, String argName) { try {//ww w . j av a2s . c o m Object xmlAnalyzers = args.get(argName); if (xmlAnalyzers != null && xmlAnalyzers instanceof NamedList) { NamedList<?> AnalyzersList = (NamedList<?>) xmlAnalyzers; for (Entry<String, ?> entry : AnalyzersList) { String analyzerName = entry.getKey(); if (!(entry.getValue() instanceof NamedList)) { continue; } NamedList<?> analyzerAsNamedList = (NamedList<?>) entry.getValue(); TokenizerFactory tokenizerFactory = null; TokenFilterFactory filterFactory; List<TokenFilterFactory> filterFactories = new LinkedList<>(); for (Entry<String, ?> analyzerEntry : analyzerAsNamedList) { String key = analyzerEntry.getKey(); if (!(entry.getValue() instanceof NamedList)) { continue; } Map<String, String> params = convertNamedListToMap((NamedList<?>) analyzerEntry.getValue()); String className = params.get("class"); if (className == null) { continue; } params.put("luceneMatchVersion", luceneMatchVersion.toString()); if (key.equals("tokenizer")) { try { tokenizerFactory = TokenizerFactory.forName(className, params); } catch (IllegalArgumentException iae) { if (!className.contains(".")) { iae.printStackTrace(); } // Now try by classname instead of SPI keyword tokenizerFactory = loader.newInstance(className, TokenizerFactory.class, new String[] {}, new Class[] { Map.class }, new Object[] { params }); } if (tokenizerFactory instanceof ResourceLoaderAware) { ((ResourceLoaderAware) tokenizerFactory).inform(loader); } } else if (key.equals("filter")) { try { filterFactory = TokenFilterFactory.forName(className, params); } catch (IllegalArgumentException iae) { if (!className.contains(".")) { iae.printStackTrace(); } // Now try by classname instead of SPI keyword filterFactory = loader.newInstance(className, TokenFilterFactory.class, new String[] {}, new Class[] { Map.class }, new Object[] { params }); } if (filterFactory instanceof ResourceLoaderAware) { ((ResourceLoaderAware) 
filterFactory).inform(loader); } filterFactories.add(filterFactory); } } if (tokenizerFactory == null) { throw new SolrException(ErrorCode.SERVER_ERROR, "tokenizer must not be null for analyzer: " + analyzerName); } else if (filterFactories.isEmpty()) { throw new SolrException(ErrorCode.SERVER_ERROR, "filter factories must be defined for analyzer: " + analyzerName); } TokenizerChain analyzer = new TokenizerChain(tokenizerFactory, filterFactories.toArray(new TokenFilterFactory[filterFactories.size()])); analyzers.put(analyzerName, analyzer); } } } catch (IOException e) { throw new SolrException(ErrorCode.SERVER_ERROR, "Failed to create parser. Check your config.", e); } }
From source file:com.grantingersoll.opengrok.analysis.TokenizerGuru.java
License:Open Source License
/** * Reads in the given properties file, where keys are tokenizer SPI names and values are * space-and-or-comma delimited, and adds each list value as a key, with an instantiated * tokenizer for the given SPI name as its value. * * @param map The map to populate/*from w w w. j a va 2 s.c om*/ * @param propertiesFile The name of the resource to read in as a properties file * @param factorySingletons map of tokenizer SPI names to tokenizer factory, to be used to limit * tokenizer factory instantiation to one per factory class. * @return If the map is sorted, returns a regex alternation of all map keys, otherwise null. */ private static String populateTokenizerFactoryMap(Map<String, SymbolTokenizerFactory> map, String propertiesFile, Map<String, SymbolTokenizerFactory> factorySingletons) throws IOException { Properties values = new Properties(); InputStream stream = TokenizerGuru.class.getResourceAsStream(propertiesFile); values.load(stream); for (Map.Entry<Object, Object> entry : values.entrySet()) { String tokenizerSPIname = (String) entry.getKey(); String valueList = (String) entry.getValue(); SymbolTokenizerFactory factory = factorySingletons.get(tokenizerSPIname); if (factory == null) { factory = (SymbolTokenizerFactory) TokenizerFactory.forName(tokenizerSPIname, Collections.<String, String>emptyMap()); if (factory instanceof ResourceLoaderAware) { ((ResourceLoaderAware) factory).inform(CLASSPATH_RESOURCE_LOADER); } factorySingletons.put(tokenizerSPIname, factory); } // Ignore backslash-escaped value list separators for (String value : valueList.trim().split("(?<!\\\\)(?:\\s*,\\s*|\\s+)")) { value = value.replaceAll("\\A\\s+|(?<!\\\\)\\s+\\z", ""); // trim non-escaped prefix/suffix whitespace value = value.replaceAll("\\\\(.)", "$1"); // unescape backslash-escaped chars map.put(value, factory); } } // The regex is only needed for prefix matching, in which case the map keys are sorted longest-first String regex = null; if (map instanceof SortedMap) { 
StringBuilder regexBuilder = new StringBuilder(); for (String key : map.keySet()) { if (regexBuilder.length() > 0) { regexBuilder.append("|"); } regexBuilder.append(Pattern.quote(key)); } regex = regexBuilder.toString(); } return regex; }
From source file:edu.cmu.lti.oaqa.annographix.solr.SolrTokenizerWrapper.java
License:Apache License
/** * A tokenizer: shouldn't be shared among threads (i.e., each thread * should use its own copy)./*from w ww .j ava 2 s .com*/ * * <p> * The tokenizer can be specified using a short or full class name. * However, only the full class name can be used to directly initialize * the tokenizer. In the case of a short name, we need to use * a tokenizer ID corresponding to this short name. For example, * the ID for <b>solr.StandardTokenizerFactory</b> is <b>standard</b>. * </p> * <p> * Sadly, Leo couldn't find a standard an API function to obtain this * short tokenizer ID from a name specified in a configuration file. * Thus, a custom code is used, which guesses this short ID based * on the short class name specified in the config. If this guessing * code fails for any reason (e.g., it may stop working in * future Lucene/SOLR versions), one can specify the full class name * in the SOLR schema file. However, it seems to be working fine * with Standard SOLR tokenizer factories. * </p> * <p> * For instance, instead of <br> * <tokenizer class="solr.StandardTokenizerFactory" maxTokenLength="255"/><br> * one can explicitly specify the full class name:<br> * <tokenizer class="org.apache.lucene.analysis.standard.StandardTokenizerFactory" maxTokenLength="255" /><br> * </p> * * @param params an objects specifying a name of the tokenizer class, * as well as tokenizer's parameters. 
* * @throws ClassNotFoundException * @throws SecurityException * @throws NoSuchMethodException * @throws InvocationTargetException * @throws IllegalArgumentException * @throws IllegalAccessException * @throws InstantiationException */ public SolrTokenizerWrapper(TokenizerParams params) throws InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, ClassNotFoundException { String tokClassName = params.getTokClassName(); Map<String, String> tokClassArgs = params.getTokClassArgs(); if (tokClassName.startsWith(SHORT_NAME_PREFIX) && tokClassName.endsWith(SHORT_NAME_SUFFIX)) { /* * If it is a standard SOLR name, let's try to rewrite the class * using the rewriting convention. */ tokClassName = tokClassName.substring(SHORT_NAME_PREFIX.length()); tokClassName = tokClassName.substring(0, tokClassName.length() - SHORT_NAME_SUFFIX.length()); tokClassName = tokClassName.toLowerCase(); mTokStreamFactory = (TokenizerFactory) TokenizerFactory.forName(tokClassName, tokClassArgs); } else { // Load by full class name mTokStreamFactory = (TokenizerFactory) Class.forName(tokClassName).asSubclass(TokenizerFactory.class) .getConstructor(Map.class).newInstance(tokClassArgs); } }
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.NodeStateAnalyzerFactory.java
License:Apache License
/**
 * Creates and initializes a {@link TokenizerFactory} from the persisted analyzer
 * node state. The factory is looked up by its SPI name stored under
 * {@code LuceneIndexConstants.ANL_NAME}; remaining node properties become its args.
 */
private TokenizerFactory loadTokenizer(NodeState state) {
    String factoryName = checkNotNull(state.getString(LuceneIndexConstants.ANL_NAME));
    TokenizerFactory factory = TokenizerFactory.forName(factoryName, convertNodeState(state));
    init(factory, state);
    return factory;
}
From source file:org.apache.tika.eval.tokens.AnalyzerDeserializer.java
License:Apache License
/**
 * Builds a {@link TokenizerFactory} from a JSON definition of the form
 * <code>{"factory": &lt;class name, optionally "oala."-abbreviated&gt;, "params": {...}}</code>.
 *
 * @param map          JSON element expected to be an object with "factory" and optional "params"
 * @param analyzerName name of the enclosing analyzer, used only in error messages
 * @return the initialized tokenizer factory
 * @throws IOException if a ResourceLoaderAware factory fails to initialize
 * @throws IllegalArgumentException if the definition is malformed or the class is not
 *         registered as a TokenizerFactory SPI implementation
 */
private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
    if (!(map instanceof JsonObject)) {
        throw new IllegalArgumentException("Expecting a map with \"factory\" string and "
                + "\"params\" map in tokenizer factory;" + " not: " + map.toString() + " in " + analyzerName);
    }
    JsonElement factoryEl = ((JsonObject) map).get(FACTORY);
    if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
        throw new IllegalArgumentException(
                "Expecting value for factory in char filter factory builder in:" + analyzerName);
    }
    String factoryName = factoryEl.getAsString();
    // "oala." is shorthand for "org.apache.lucene.analysis."
    factoryName = factoryName.startsWith("oala.")
            ? factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.")
            : factoryName;
    JsonElement paramsEl = ((JsonObject) map).get(PARAMS);
    Map<String, String> params = mapify(paramsEl);
    // Reverse-map the class name to its SPI keyword, since forName() takes the keyword.
    String spiName = "";
    for (String s : TokenizerFactory.availableTokenizers()) {
        Class<?> clazz = TokenizerFactory.lookupClass(s); // FIX: was a raw Class type
        if (clazz.getName().equals(factoryName)) {
            spiName = s;
            break;
        }
    }
    if (spiName.isEmpty()) {
        throw new IllegalArgumentException(
                "A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name" + "'"
                        + factoryName + "' does not exist.");
    }
    try {
        TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
        if (tokenizerFactory instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) tokenizerFactory)
                    .inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
        }
        return tokenizerFactory;
    } catch (IllegalArgumentException e) {
        // Re-wrap so the failing analyzer is identified, preserving the cause.
        throw new IllegalArgumentException("While working on " + analyzerName, e);
    }
}
From source file:org.tallison.gramreaper.ingest.schema.AnalyzerDeserializer.java
License:Apache License
/**
 * Builds a {@link TokenizerFactory} from a JSON definition of the form
 * <code>{"factory": &lt;class name, optionally "oala."-abbreviated&gt;, "params": {...}}</code>.
 *
 * @param map          JSON element expected to be an object with "factory" and optional "params"
 * @param analyzerName name of the enclosing analyzer, used only in error messages
 * @return the initialized tokenizer factory
 * @throws IOException if a ResourceLoaderAware factory fails to initialize
 * @throws IllegalArgumentException if the definition is malformed or the class is not
 *         registered as a TokenizerFactory SPI implementation
 */
private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
    if (!(map instanceof JsonObject)) {
        throw new IllegalArgumentException("Expecting a map with \"factory\" string and "
                + "\"params\" map in tokenizer factory;" + " not: " + map.toString() + " in " + analyzerName);
    }
    JsonElement factoryEl = ((JsonObject) map).get(FACTORY);
    if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
        throw new IllegalArgumentException(
                "Expecting value for factory in char filter factory builder in:" + analyzerName);
    }
    String factoryName = factoryEl.getAsString();
    // "oala." is shorthand for "org.apache.lucene.analysis."
    factoryName = factoryName.startsWith("oala.")
            ? factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.")
            : factoryName;
    JsonElement paramsEl = ((JsonObject) map).get(PARAMS);
    Map<String, String> params = mapify(paramsEl);
    // Reverse-map the class name to its SPI keyword, since forName() takes the keyword.
    String spiName = "";
    for (String s : TokenizerFactory.availableTokenizers()) {
        Class<?> clazz = TokenizerFactory.lookupClass(s); // FIX: was a raw Class type
        if (clazz.getName().equals(factoryName)) {
            spiName = s;
            break;
        }
    }
    // BUG FIX: unlike the parallel implementation of this method elsewhere, the original
    // omitted this guard, so an unregistered class fell through to forName("") and
    // produced a confusing SPI-lookup error instead of naming the missing factory.
    if (spiName.isEmpty()) {
        throw new IllegalArgumentException(
                "A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name" + "'"
                        + factoryName + "' does not exist.");
    }
    try {
        TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
        if (tokenizerFactory instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) tokenizerFactory)
                    .inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
        }
        return tokenizerFactory;
    } catch (IllegalArgumentException e) {
        // Re-wrap so the failing analyzer is identified, preserving the cause.
        throw new IllegalArgumentException("While working on " + analyzerName, e);
    }
}