Example usage for org.apache.lucene.analysis.custom CustomAnalyzer builder

List of usage examples for org.apache.lucene.analysis.custom CustomAnalyzer builder

Introduction

On this page you can find example usage of org.apache.lucene.analysis.custom.CustomAnalyzer#builder.

Prototype

public static Builder builder(ResourceLoader loader) 

Source Link

Document

Returns a builder for custom analyzers that loads all resources using the given ResourceLoader.

Usage

From source file:org.tallison.lucene.queryparser.spans.TestAdvancedAnalyzers.java

License:Apache License

/**
 * One-time setup for the test class: builds the analyzers exercised by the
 * tests and indexes a small fixture corpus into {@code directory}, then opens
 * {@code reader}/{@code searcher} over it. All assigned fields
 * ({@code lcMultiTermAnalyzer}, {@code complexAnalyzer}, {@code synAnalyzer},
 * {@code baseAnalyzer}, {@code ucVowelAnalyzer}, {@code ucVowelMTAnalyzer},
 * {@code directory}, {@code reader}, {@code searcher}) are static fields
 * declared elsewhere in this class.
 *
 * @throws Exception if analyzer construction or indexing fails
 */
@BeforeClass
public static void beforeClass() throws Exception {
    // Keyword tokenizer (whole input as one token) with lowercasing enabled;
    // presumably used for multi-term (wildcard/prefix) queries — name suggests it.
    lcMultiTermAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);

    // WordDelimiterFilter configuration: emit word and number parts, catenate
    // words/numbers/all, split on case changes, and also keep the original token.
    Map<String, String> attrs = new HashMap<>();
    attrs.put("generateWordParts", "1");
    attrs.put("generateNumberParts", "1");
    attrs.put("catenateWords", "1");
    attrs.put("catenateNumbers", "1");
    attrs.put("catenateAll", "1");
    attrs.put("splitOnCaseChange", "1");
    attrs.put("preserveOriginal", "1");
    // CustomAnalyzer chain: whitespace tokenizer -> word delimiter (attrs above)
    // -> kstem stemming -> duplicate-token removal. Resources are resolved from
    // this test class's classpath.
    complexAnalyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(TestAdvancedAnalyzers.class))
            .withTokenizer("whitespace").addTokenFilter("worddelimiter", attrs).addTokenFilter("kstem")
            .addTokenFilter("removeduplicates").build();

    // Analyzer whose index-time chain adds MockSynFilter on top of
    // MockNonWhitespaceFilter; normalize() applies the same two filters but in
    // the opposite nesting order (MockSynFilter innermost).
    synAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {

            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);

            filter = new MockSynFilter(filter);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockNonWhitespaceFilter(new MockSynFilter(in));
        }

    };

    // Baseline analyzer: simple tokenizer + MockNonWhitespaceFilter only.
    // Used below as the IndexWriter's default analyzer.
    baseAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockNonWhitespaceFilter(new LowerCaseFilter(in));
        }

    };

    // Variant applying MockUCVowelFilter after the simple tokenizer; used to
    // populate FIELD2 below via an explicit token stream.
    ucVowelAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }
    };

    // Same filter chain as ucVowelAnalyzer but with a KEYWORD tokenizer
    // (whole-input token); "MT" presumably stands for multi-term — TODO confirm.
    ucVowelMTAnalyzer = new Analyzer() {
        @Override
        public TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };

    // Local duplicate of ucVowelAnalyzer, used only while indexing FIELD4 and
    // not retained in a field.
    Analyzer tmpUCVowelAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }
    };
    directory = newDirectory();
    // Index a tiny fixture corpus; FIELD1/FIELD3 use the writer's baseAnalyzer,
    // while FIELD2/FIELD4 carry pre-built token streams from the vowel analyzers.
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(baseAnalyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
    String[] docs = new String[] { "abc_def", "lmnop", "abc one", "abc two", "qrs one", "qrs two", "tuv one",
            "tuv two", "qrs tuv", "qrs_tuv" };
    for (int i = 0; i < docs.length; i++) {
        Document doc = new Document();
        doc.add(newTextField(FIELD1, docs[i], Field.Store.YES));
        TextField tf = new TextField(FIELD2, docs[i], Field.Store.YES);
        tf.setTokenStream(ucVowelAnalyzer.tokenStream(FIELD2, docs[i]));
        doc.add(tf);
        doc.add(newTextField(FIELD3, docs[i], Field.Store.YES));

        TextField tf4 = new TextField(FIELD4, docs[i], Field.Store.YES);
        tf4.setTokenStream(tmpUCVowelAnalyzer.tokenStream(FIELD4, docs[i]));
        doc.add(tf4);
        writer.addDocument(doc);
    }
    // Open the reader before closing the writer so the searcher sees all docs.
    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
}

From source file:practica2_1.Practica2_1.java

/**
 * Tokenizes every file in the directory named by {@code args[0]} with a set of
 * Lucene analyzers and hands each token list to {@code process}, labeling the
 * output with the analyzer's name.
 *
 * <p>Files whose name contains {@code .java} are run only through the custom
 * code analyzer (index 6). All other files are run through the six generic
 * analyzers; the language-specific ones (English/French/Finnish) are applied
 * only when Tika's detected language matches.
 *
 * @param args args[0] is the path of the directory whose files are processed
 * @throws IOException   if a file cannot be read or output cannot be written
 * @throws TikaException if Tika fails to extract text from a file
 */
public static void main(String[] args) throws IOException, TikaException {
    Analyzer[] analizadores = { new WhitespaceAnalyzer(), new SimpleAnalyzer(), new StandardAnalyzer(),
            new EnglishAnalyzer(), new FrenchAnalyzer(), new FinnishAnalyzer(),
            // Custom analyzer for source code: standard tokenizer, lowercasing,
            // and a stop-word list loaded from an absolute path.
            CustomAnalyzer.builder(Paths.get("")).withTokenizer(StandardTokenizerFactory.class)
                    .addTokenFilter(LowerCaseFilterFactory.class)
                    .addTokenFilter(StopFilterFactory.class, "words",
                            "C:\\Users\\Javi\\Desktop\\RI\\practica2\\stopwords.txt")
                    .build() };
    File f = new File(args[0]);
    Tika tika = new Tika();
    if (f.exists()) {
        File[] ficheros = f.listFiles();
        if (ficheros == null) {
            // Not a directory (or unreadable): nothing to process.
            return;
        }
        for (int i = 0; i < ficheros.length; i++) {
            System.out.println(ficheros[i].getAbsolutePath());
        }
        for (int i = 0; i < ficheros.length; i++) {
            File f2 = new File(ficheros[i].getAbsolutePath());
            String text = tika.parseToString(f2);
            String language = identifyLanguage(text);
            String name = ficheros[i].getAbsolutePath();
            if (name.indexOf(".java") != -1) {
                // Source files: custom code analyzer only.
                List<String> result = tokenizeString(analizadores[6], text);
                process(result, name + "_codeAnalyzer.txt");
            } else {
                // All six generic analyzers (the custom one at index 6 is skipped).
                for (int j = 0; j < analizadores.length - 1; j++) {
                    // BUG FIX: the original called tokenizeString(analizadores[i], ...)
                    // using the FILE counter i, which mismatches analyzers with their
                    // output labels and throws ArrayIndexOutOfBoundsException once the
                    // directory contains more files than there are analyzers.
                    List<String> result2 = tokenizeString(analizadores[j], text);
                    if (j == 0)
                        process(result2, name + "_WhitespaceAnalyzer");
                    else if (j == 1)
                        process(result2, name + "_SimpleAnalyzer");
                    else if (j == 2)
                        process(result2, name + "_StandardAnalyzer");
                    else if (j == 3 && language.equals("en"))
                        process(result2, name + "_englishAnalyzer");
                    else if (j == 4 && language.equals("fr"))
                        process(result2, name + "_frenchAnalyzer");
                    else if (j == 5 && language.equals("fi"))
                        process(result2, name + "_finnishAnalyzer");
                }
            }
        }
    }
}