Example usage for org.apache.lucene.analysis.miscellaneous WordDelimiterFilterFactory WordDelimiterFilterFactory

List of usage examples for org.apache.lucene.analysis.miscellaneous WordDelimiterFilterFactory WordDelimiterFilterFactory

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.miscellaneous WordDelimiterFilterFactory WordDelimiterFilterFactory.

Prototype

public WordDelimiterFilterFactory(Map<String, String> args) 

Source Link

Document

Creates a new WordDelimiterFilterFactory

Usage

From source file: com.xiaomi.linden.lucene.analyzer.LindenWordDelimiterAnalyzer.java

License: Apache License

/**
 * Creates an analyzer wrapping Lucene's {@code WordDelimiterFilterFactory}.
 * The following factory options are understood (all passed through via {@code params}):
 * <p>
 * generateWordParts
 * Causes parts of words to be generated:
 * <p/>
 * "PowerShot" => "Power" "Shot"
 * <p>
 * generateNumberParts
 * Causes number subwords to be generated:
 * <p/>
 * "500-42" => "500" "42"
 * <p>
 * catenateWords
 * Causes maximum runs of word parts to be catenated:
 * <p/>
 * "wi-fi" => "wifi"
 * <p>
 * catenateNumbers
 * Causes maximum runs of number parts to be catenated:
 * <p/>
 * "500-42" => "50042"
 * <p>
 * catenateAll
 * Causes all subword parts to be catenated:
 * <p/>
 * "wi-fi-4000" => "wifi4000"
 * <p>
 * preserveOriginal
 * Causes original words to be preserved and added to the subword list (defaults to false):
 * <p/>
 * "500-42" => "500" "42" "500-42"
 * <p>
 * splitOnCaseChange
 * If not set, causes case changes to be ignored (subwords will only be generated
 * given SUBWORD_DELIM tokens)
 * <p>
 * splitOnNumerics
 * If not set, causes numeric changes to be ignored (subwords will only be generated
 * given SUBWORD_DELIM tokens).
 * <p>
 * stemEnglishPossessive
 * Causes trailing "'s" to be removed for each subword
 * <p/>
 * "O'Neil's" => "O", "Neil"
 *
 * @param params factory arguments; the Linden-specific keys are consumed here,
 *               the remainder is handed to {@code WordDelimiterFilterFactory}
 */

public LindenWordDelimiterAnalyzer(Map<String, String> params) {
    // Consume our own keys before handing the map to the Lucene factory
    // (the factory rejects unrecognized arguments). Map.remove returns the
    // previous value, so each key is looked up only once.
    String stopWords = params.remove(SET_STOP_WORDS);
    if (stopWords != null) {
        this.setStopWords = Boolean.parseBoolean(stopWords);
    }
    String lowerCase = params.remove(TO_LOWER_CASE);
    if (lowerCase != null) {
        this.toLowerCase = Boolean.parseBoolean(lowerCase);
    }
    factoryDefault = new WordDelimiterFilterFactory(params);
}

From source file: org.apache.solr.analysis.TestWordDelimiterFilterFactory.java

License: Apache License

/**
 * Verifies that a custom "types" mapping file changes how the word delimiter
 * filter classifies characters such as '$', ',', '.' and '%': with the default
 * types they are delimiters, with the custom mapping they are kept in tokens.
 */
@Test
public void testCustomTypes() throws Exception {
    String testText = "I borrowed $5,400.00 at 25% interest-rate";
    ResourceLoader loader = new SolrResourceLoader("solr/collection1");

    /* default behavior */
    WordDelimiterFilterFactory factoryDefault = new WordDelimiterFilterFactory(baseWordDelimiterArgs());
    factoryDefault.inform(loader);

    TokenStream ts = factoryDefault
            .create(new MockTokenizer(new StringReader(testText), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "I", "borrowed", "5", "400", "00",
            "540000", "at", "25", "interest", "rate", "interestrate" });

    // U+200D (zero-width joiner) is treated as a delimiter by default.
    ts = factoryDefault
            .create(new MockTokenizer(new StringReader("foo\u200Dbar"), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "foo", "bar", "foobar" });

    /* custom behavior: same base arguments plus a custom type mapping */
    Map<String, String> args = baseWordDelimiterArgs();
    args.put("types", "wdftypes.txt");
    WordDelimiterFilterFactory factoryCustom = new WordDelimiterFilterFactory(args);
    factoryCustom.inform(loader);

    ts = factoryCustom.create(new MockTokenizer(new StringReader(testText), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts,
            new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "rate", "interestrate" });

    /* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */
    ts = factoryCustom
            .create(new MockTokenizer(new StringReader("foo\u200Dbar"), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "foo\u200Dbar" });
}

/** Builds the six word-delimiter arguments shared by both factory configurations above. */
private static Map<String, String> baseWordDelimiterArgs() {
    Map<String, String> args = new HashMap<String, String>();
    args.put("generateWordParts", "1");
    args.put("generateNumberParts", "1");
    args.put("catenateWords", "1");
    args.put("catenateNumbers", "1");
    args.put("catenateAll", "0");
    args.put("splitOnCaseChange", "1");
    return args;
}

From source file: uk.gov.nationalarchives.discovery.taxonomy.common.config.LuceneIAViewConfiguration.java

License: Mozilla Public License

/**
 * Creates a {@link WordDelimiterFilterFactory} bean configured to preserve
 * original tokens, generate word parts, and catenate runs of word parts.
 *
 * @return the filter factory; it is returned even if {@code inform} fails,
 *         in which case resource-based configuration may be missing
 */
public @Bean WordDelimiterFilterFactory wordDelimiterFilterFactory() {
    Map<String, String> wordDelimiterFilterArgs = new HashMap<String, String>();
    wordDelimiterFilterArgs.put("preserveOriginal", "1");
    wordDelimiterFilterArgs.put("generateWordParts", "1");
    wordDelimiterFilterArgs.put("catenateWords", "1");
    wordDelimiterFilterArgs.put("luceneMatchVersion", version);
    WordDelimiterFilterFactory wordDelimiterFilterFactory = new WordDelimiterFilterFactory(
            wordDelimiterFilterArgs);

    try {
        // inform() lets the factory load auxiliary resources (e.g. type tables).
        ResourceLoader loader = new ClasspathResourceLoader(getClass());
        wordDelimiterFilterFactory.inform(loader);
    } catch (IOException e) {
        // Pass the exception as the last SLF4J argument so the full stack
        // trace is logged, not just the message.
        logger.error(".wordDelimiterFilterFactory: an error occurred while creating the filter factory", e);
    }
    return wordDelimiterFilterFactory;

}