Example usage for org.apache.lucene.analysis.util TokenizerFactory availableTokenizers

List of usage examples for org.apache.lucene.analysis.util TokenizerFactory availableTokenizers

Introduction

On this page you can find usage examples for the method availableTokenizers of org.apache.lucene.analysis.util.TokenizerFactory.

Prototype

public static Set<String> availableTokenizers() 

Source Link

Document

Returns the set of all available tokenizer names found on the context classpath.

Usage

From source file:edu.cmu.lti.oaqa.annographix.solr.SolrTokenizerWrapper.java

License:Apache License

/**
 * A simple test function to check basic functionality manually.
 * <p>
 * Prints every tokenizer name reported by
 * {@code TokenizerFactory.availableTokenizers()}, parses the key=value
 * parameters, creates a {@code SolrTokenizerWrapper} for the requested
 * tokenizer factory class and prints the annotations produced for the text.
 *
 * @param args    args[0] is a tokenizer factory class;
 *                args[1] is a sentence to tokenize;
 *                args[2] (optional) represents space-separated parameter
 *                        pairs in the form key=value.
 *
 * @throws IllegalArgumentException if fewer than two arguments are given or
 *                                  a parameter is not in the form key=value
 * @throws Exception anything thrown while creating or running the tokenizer
 *                   is propagated unchanged
 */
public static void main(String args[]) throws Exception {
    if (args == null || args.length < 2) {
        throw new IllegalArgumentException(
                "Usage: <tokenizer factory class> <text> [<space-separated key=value pairs>]");
    }
    String tokClassName = args[0];
    String text = args[1];
    // The parameter list is optional: treat a missing third argument as "no parameters".
    String params = args.length > 2 ? args[2] : "";

    // Let's print all available tokenizers
    for (String e : TokenizerFactory.availableTokenizers()) {
        System.out.println("###: " + e);
    }

    Map<String, String> tokClassArgs = new HashMap<String, String>();
    for (String part : params.split("\\s+")) {
        if (part.isEmpty()) {
            continue;
        }
        // Split on the first '=' only, so that values may themselves contain '='
        // and an empty value ("key=") is accepted instead of crashing.
        int sep = part.indexOf('=');
        if (sep <= 0) {
            throw new IllegalArgumentException(
                    "Expecting a parameter in the form key=value, but got: '" + part + "'");
        }
        String key = part.substring(0, sep);
        String value = part.substring(sep + 1);
        tokClassArgs.put(key, value);
        System.out.println(String.format("Adding a parameter: %s=%s", key, value));
    }

    System.out.println("Creating a tokenizer: " + tokClassName);

    SolrTokenizerWrapper tokenizer = new SolrTokenizerWrapper(new TokenizerParams(tokClassName, tokClassArgs));

    for (AnnotationProxy e : tokenizer.tokenize(text, 100)) {
        e.debugPrint();
    }
}

From source file:org.apache.tika.eval.tokens.AnalyzerDeserializer.java

License:Apache License

/**
 * Builds a Lucene {@code TokenizerFactory} from its JSON description.
 * <p>
 * The description must be a JSON object holding the factory class name under
 * {@code FACTORY} and, under {@code PARAMS}, the parameters that are handed
 * to {@code mapify}. A class name starting with {@code "oala."} is expanded
 * to {@code "org.apache.lucene.analysis."}. The class name is then matched
 * against the classes of all tokenizers registered via SPI to find the SPI
 * name required by {@code TokenizerFactory.forName}.
 *
 * @param map          JSON element describing the tokenizer factory
 * @param analyzerName name of the analyzer being built; used in error messages only
 * @return the configured factory; if it is {@code ResourceLoaderAware} it has
 *         already been informed with a classpath resource loader
 * @throws IllegalArgumentException if {@code map} is not a JSON object, the
 *         factory name is missing or not a primitive, no registered tokenizer
 *         has the requested class, or the factory rejects its parameters
 * @throws IOException declared for {@code ResourceLoaderAware.inform}
 */
private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
    if (!(map instanceof JsonObject)) {
        // Concatenate the element itself rather than calling map.toString():
        // a null element must yield this exception, not a NullPointerException.
        throw new IllegalArgumentException("Expecting a map with \"factory\" string and "
                + "\"params\" map in tokenizer factory;" + " not: " + map + " in " + analyzerName);
    }
    JsonObject factoryDef = (JsonObject) map;

    JsonElement factoryEl = factoryDef.get(FACTORY);
    if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
        throw new IllegalArgumentException(
                "Expecting value for factory in tokenizer factory builder in:" + analyzerName);
    }

    // Expand the "oala." shorthand with plain string operations; the prefix is
    // a literal, not a regular expression.
    final String shortPrefix = "oala.";
    String factoryName = factoryEl.getAsString();
    if (factoryName.startsWith(shortPrefix)) {
        factoryName = "org.apache.lucene.analysis." + factoryName.substring(shortPrefix.length());
    }

    JsonElement paramsEl = factoryDef.get(PARAMS);
    Map<String, String> params = mapify(paramsEl);

    // TokenizerFactory.forName expects the SPI name, so look it up by class name.
    String spiName = null;
    for (String candidate : TokenizerFactory.availableTokenizers()) {
        Class<?> clazz = TokenizerFactory.lookupClass(candidate);
        if (clazz.getName().equals(factoryName)) {
            spiName = candidate;
            break;
        }
    }
    if (spiName == null) {
        throw new IllegalArgumentException(
                "A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name '"
                        + factoryName + "' does not exist.");
    }
    try {
        TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
        if (tokenizerFactory instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) tokenizerFactory)
                    .inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
        }

        return tokenizerFactory;
    } catch (IllegalArgumentException e) {
        // Add the analyzer name as context while keeping the original cause.
        throw new IllegalArgumentException("While working on " + analyzerName, e);
    }
}

From source file:org.tallison.gramreaper.ingest.schema.AnalyzerDeserializer.java

License:Apache License

/**
 * Builds a Lucene {@code TokenizerFactory} from its JSON description.
 * <p>
 * The description must be a JSON object holding the factory class name under
 * {@code FACTORY} and, under {@code PARAMS}, the parameters that are handed
 * to {@code mapify}. A class name starting with {@code "oala."} is expanded
 * to {@code "org.apache.lucene.analysis."}. The class name is then matched
 * against the classes of all tokenizers registered via SPI to find the SPI
 * name required by {@code TokenizerFactory.forName}.
 *
 * @param map          JSON element describing the tokenizer factory
 * @param analyzerName name of the analyzer being built; used in error messages only
 * @return the configured factory; if it is {@code ResourceLoaderAware} it has
 *         already been informed with a classpath resource loader
 * @throws IllegalArgumentException if {@code map} is not a JSON object, the
 *         factory name is missing or not a primitive, no registered tokenizer
 *         has the requested class, or the factory rejects its parameters
 * @throws IOException declared for {@code ResourceLoaderAware.inform}
 */
private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
    if (!(map instanceof JsonObject)) {
        // Concatenate the element itself rather than calling map.toString():
        // a null element must yield this exception, not a NullPointerException.
        throw new IllegalArgumentException("Expecting a map with \"factory\" string and "
                + "\"params\" map in tokenizer factory;" + " not: " + map + " in " + analyzerName);
    }
    JsonObject factoryDef = (JsonObject) map;

    JsonElement factoryEl = factoryDef.get(FACTORY);
    if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
        throw new IllegalArgumentException(
                "Expecting value for factory in tokenizer factory builder in:" + analyzerName);
    }

    // Expand the "oala." shorthand with plain string operations; the prefix is
    // a literal, not a regular expression.
    final String shortPrefix = "oala.";
    String factoryName = factoryEl.getAsString();
    if (factoryName.startsWith(shortPrefix)) {
        factoryName = "org.apache.lucene.analysis." + factoryName.substring(shortPrefix.length());
    }

    JsonElement paramsEl = factoryDef.get(PARAMS);
    Map<String, String> params = mapify(paramsEl);

    // TokenizerFactory.forName expects the SPI name, so look it up by class name.
    String spiName = null;
    for (String candidate : TokenizerFactory.availableTokenizers()) {
        Class<?> clazz = TokenizerFactory.lookupClass(candidate);
        if (clazz.getName().equals(factoryName)) {
            spiName = candidate;
            break;
        }
    }
    // Fail with a message naming the requested class instead of passing an
    // empty SPI name on to TokenizerFactory.forName.
    if (spiName == null) {
        throw new IllegalArgumentException(
                "A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name '"
                        + factoryName + "' does not exist.");
    }
    try {
        TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
        if (tokenizerFactory instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) tokenizerFactory)
                    .inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
        }

        return tokenizerFactory;
    } catch (IllegalArgumentException e) {
        // Add the analyzer name as context while keeping the original cause.
        throw new IllegalArgumentException("While working on " + analyzerName, e);
    }
}

From source file:org.zephyrsoft.sdb2.service.IndexerServiceImpl.java

License:Open Source License

/**
 * Hands an indexing task for the given songs to the executor.
 * <p>
 * The task writes one document per song into a new {@code RAMDirectory},
 * records each song in {@code songByUuid} under its UUID and finally passes
 * the directory to {@code putIndex} for {@code indexType}. Because that call
 * sits in a {@code finally} block, the directory is passed on even when
 * creating the analyzer or writing the index failed; such failures are only
 * logged as warnings.
 *
 * @param indexType the index type the resulting directory is stored under
 * @param songs     the songs to add to the index
 */
@Override
public void index(final IndexType indexType, final Collection<Song> songs) {
    Runnable indexingTask = new Runnable() {
        @Override
        public void run() {
            Stopwatch timer = Stopwatch.createStarted();

            Directory ramIndex = new RAMDirectory();
            try {
                LOG.debug("available tokenizers: {}", TokenizerFactory.availableTokenizers());
                LOG.debug("available token filters: {}", TokenFilterFactory.availableTokenFilters());

                // standard tokenizer -> lowercase -> n-grams of length 1 to 25
                Analyzer ngramAnalyzer = CustomAnalyzer.builder()
                        .withTokenizer("standard")
                        .addTokenFilter("lowercase")
                        .addTokenFilter("ngram", "minGramSize", "1", "maxGramSize", "25")
                        .build();
                IndexWriterConfig writerConfig = new IndexWriterConfig(ngramAnalyzer);

                try (IndexWriter indexWriter = new IndexWriter(ramIndex, writerConfig)) {
                    for (Song current : songs) {
                        indexWriter.addDocument(createDocument(current));
                        songByUuid.put(current.getUUID(), current);
                    }
                } catch (IOException writeFailure) {
                    LOG.warn("couldn't index songs", writeFailure);
                }
            } catch (IOException analyzerFailure) {
                LOG.warn("couldn't create analyzer", analyzerFailure);
            } finally {
                // Runs on success and on failure alike.
                putIndex(indexType, ramIndex);
                timer.stop();
                LOG.info("indexing songs in background thread took {}", timer.toString());
            }
        }
    };
    executor.execute(indexingTask);
}