List of usage examples for org.apache.lucene.analysis.util TokenizerFactory availableTokenizers
public static Set<String> availableTokenizers()
From source file:edu.cmu.lti.oaqa.annographix.solr.SolrTokenizerWrapper.java
License:Apache License
/** * A simple test function to check basic functionality manually. * //from ww w . j a v a 2 s . c o m * @param args args[0] is a tokenizer factory class; * args[1] is a sentence to tokenize; * args[2] represents space-separated parameter pairs in the * form key=value. * * @throws Exception */ public static void main(String args[]) throws Exception { String tokClassName = args[0]; String text = args[1]; String params = args[2]; // Let's print all available tokenizers for (String e : TokenizerFactory.availableTokenizers()) { System.out.println("###: " + e); } Map<String, String> tokClassArgs = new HashMap<String, String>(); for (String part : params.split("\\s+")) if (!part.isEmpty()) { String tmp[] = part.split("="); tokClassArgs.put(tmp[0], tmp[1]); System.out.println(String.format("Adding a parameter: %s=%s", tmp[0], tmp[1])); } System.out.println("Creating a tokenizer: " + tokClassName); SolrTokenizerWrapper tokenizer = new SolrTokenizerWrapper(new TokenizerParams(tokClassName, tokClassArgs)); for (AnnotationProxy e : tokenizer.tokenize(text, 100)) { e.debugPrint(); } }
From source file:org.apache.tika.eval.tokens.AnalyzerDeserializer.java
License:Apache License
private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException { if (!(map instanceof JsonObject)) { throw new IllegalArgumentException("Expecting a map with \"factory\" string and " + "\"params\" map in tokenizer factory;" + " not: " + map.toString() + " in " + analyzerName); }// w w w. j av a 2 s . com JsonElement factoryEl = ((JsonObject) map).get(FACTORY); if (factoryEl == null || !factoryEl.isJsonPrimitive()) { throw new IllegalArgumentException( "Expecting value for factory in char filter factory builder in:" + analyzerName); } String factoryName = factoryEl.getAsString(); factoryName = factoryName.startsWith("oala.") ? factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") : factoryName; JsonElement paramsEl = ((JsonObject) map).get(PARAMS); Map<String, String> params = mapify(paramsEl); String spiName = ""; for (String s : TokenizerFactory.availableTokenizers()) { Class clazz = TokenizerFactory.lookupClass(s); if (clazz.getName().equals(factoryName)) { spiName = s; break; } } if (spiName.equals("")) { throw new IllegalArgumentException( "A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name" + "'" + factoryName + "' does not exist."); } try { TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params); if (tokenizerFactory instanceof ResourceLoaderAware) { ((ResourceLoaderAware) tokenizerFactory) .inform(new ClasspathResourceLoader(AnalyzerDeserializer.class)); } return tokenizerFactory; } catch (IllegalArgumentException e) { throw new IllegalArgumentException("While working on " + analyzerName, e); } }
From source file:org.tallison.gramreaper.ingest.schema.AnalyzerDeserializer.java
License:Apache License
/**
 * Builds a {@link TokenizerFactory} from a JSON node of the form
 * {@code {"factory": <name>, "params": {...}}}.
 * <p>
 * The factory name may use the shorthand prefix {@code "oala."} for
 * {@code "org.apache.lucene.analysis."}. The fully-qualified class name is
 * resolved back to its SPI lookup name via
 * {@link TokenizerFactory#availableTokenizers()}.
 *
 * @param map          JSON element describing the tokenizer factory; must be a JsonObject
 * @param analyzerName name of the enclosing analyzer, used in error messages
 * @return the configured tokenizer factory (informed with a classpath resource
 *         loader when it is {@link ResourceLoaderAware})
 * @throws IOException if resource loading fails
 * @throws IllegalArgumentException if the JSON is malformed, the factory name
 *         is unknown, or factory construction fails
 */
private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
    if (!(map instanceof JsonObject)) {
        throw new IllegalArgumentException("Expecting a map with \"factory\" string and "
                + "\"params\" map in tokenizer factory;" + " not: " + map.toString() + " in " + analyzerName);
    }
    JsonElement factoryEl = ((JsonObject) map).get(FACTORY);
    if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
        throw new IllegalArgumentException(
                "Expecting value for factory in char filter factory builder in:" + analyzerName);
    }
    String factoryName = factoryEl.getAsString();
    // Expand the "oala." shorthand to the full Lucene analysis package prefix.
    factoryName = factoryName.startsWith("oala.")
            ? factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.")
            : factoryName;

    JsonElement paramsEl = ((JsonObject) map).get(PARAMS);
    Map<String, String> params = mapify(paramsEl);

    // Resolve the fully-qualified class name back to its SPI lookup name.
    String spiName = "";
    for (String s : TokenizerFactory.availableTokenizers()) {
        // Class<?> instead of the raw Class type.
        Class<?> clazz = TokenizerFactory.lookupClass(s);
        if (clazz.getName().equals(factoryName)) {
            spiName = s;
            break;
        }
    }
    // Fail with a clear message when no SPI name matched. Without this guard
    // (present in the sibling implementation of this method) an unknown
    // factory name reached TokenizerFactory.forName("") and produced a
    // confusing downstream error instead.
    if (spiName.equals("")) {
        throw new IllegalArgumentException(
                "A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name" + "'"
                        + factoryName + "' does not exist.");
    }

    try {
        TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
        if (tokenizerFactory instanceof ResourceLoaderAware) {
            // Give resource-aware factories access to classpath resources
            // (e.g. stopword files) relative to this class.
            ((ResourceLoaderAware) tokenizerFactory)
                    .inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
        }
        return tokenizerFactory;
    } catch (IllegalArgumentException e) {
        // Re-wrap with the analyzer name for context; the cause is preserved.
        throw new IllegalArgumentException("While working on " + analyzerName, e);
    }
}
From source file:org.zephyrsoft.sdb2.service.IndexerServiceImpl.java
License:Open Source License
@Override public void index(final IndexType indexType, final Collection<Song> songs) { executor.execute(new Runnable() { @Override//w w w . jav a2 s . co m public void run() { Stopwatch stopwatch = Stopwatch.createStarted(); Directory directory = new RAMDirectory(); try { LOG.debug("available tokenizers: {}", TokenizerFactory.availableTokenizers()); LOG.debug("available token filters: {}", TokenFilterFactory.availableTokenFilters()); Analyzer analyzer = CustomAnalyzer.builder().withTokenizer("standard") .addTokenFilter("lowercase") .addTokenFilter("ngram", "minGramSize", "1", "maxGramSize", "25").build(); IndexWriterConfig config = new IndexWriterConfig(analyzer); try (IndexWriter writer = new IndexWriter(directory, config)) { for (Song song : songs) { Document document = createDocument(song); writer.addDocument(document); songByUuid.put(song.getUUID(), song); } } catch (IOException e) { LOG.warn("couldn't index songs", e); } } catch (IOException e1) { LOG.warn("couldn't create analyzer", e1); } finally { putIndex(indexType, directory); stopwatch.stop(); LOG.info("indexing songs in background thread took {}", stopwatch.toString()); } } }); }