List of usage examples for org.apache.lucene.analysis.custom.CustomAnalyzer#builder
public static Builder builder(ResourceLoader loader)
From source file: org.tallison.lucene.queryparser.spans.TestAdvancedAnalyzers.java
License:Apache License
/**
 * One-time fixture setup: builds the analyzers shared by all tests in this class
 * and populates a random in-memory index with a small fixed document set.
 *
 * <p>Analyzers created (all assigned to static fields presumed declared elsewhere
 * in this class — not visible in this chunk):
 * <ul>
 *   <li>{@code lcMultiTermAnalyzer} — keyword mock analyzer, lowercasing enabled.</li>
 *   <li>{@code complexAnalyzer} — whitespace tokenizer + worddelimiter/kstem/removeduplicates
 *       filters, resolved by factory name via {@link CustomAnalyzer}.</li>
 *   <li>{@code synAnalyzer}, {@code baseAnalyzer}, {@code ucVowelAnalyzer},
 *       {@code ucVowelMTAnalyzer} — anonymous mock analyzers differing in their
 *       filter chains and normalize() behavior.</li>
 * </ul>
 *
 * @throws Exception propagated from index writing / analyzer construction
 */
@BeforeClass
public static void beforeClass() throws Exception {
    lcMultiTermAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);

    // WordDelimiterFilter factory args: emit word and number parts, every
    // catenated variant, split on case change, and keep the original token.
    Map<String, String> attrs = new HashMap<>();
    attrs.put("generateWordParts", "1");
    attrs.put("generateNumberParts", "1");
    attrs.put("catenateWords", "1");
    attrs.put("catenateNumbers", "1");
    attrs.put("catenateAll", "1");
    attrs.put("splitOnCaseChange", "1");
    attrs.put("preserveOriginal", "1");

    // Factories are looked up by SPI name ("whitespace", "worddelimiter", ...);
    // the classpath loader resolves any auxiliary resources next to this class.
    complexAnalyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(TestAdvancedAnalyzers.class))
            .withTokenizer("whitespace").addTokenFilter("worddelimiter", attrs).addTokenFilter("kstem")
            .addTokenFilter("removeduplicates").build();

    // Synonym analyzer: non-whitespace mock filter feeding a mock synonym filter.
    synAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
            filter = new MockSynFilter(filter);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockNonWhitespaceFilter(new MockSynFilter(in));
        }
    };

    // Baseline analyzer: non-whitespace filter only; normalize() also lowercases.
    baseAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockNonWhitespaceFilter(new LowerCaseFilter(in));
        }
    };

    // Uppercase-vowel analyzer over a SIMPLE (word-splitting) tokenizer.
    ucVowelAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }
    };

    // Multi-term variant: same filter but KEYWORD tokenizer (whole input = one token).
    // NOTE: normalize() is public here, unlike the protected overrides above.
    ucVowelMTAnalyzer = new Analyzer() {
        @Override
        public TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };

    // Local duplicate of ucVowelAnalyzer used only to feed FIELD4's token stream
    // below (kept separate so the shared static analyzer is not reused here).
    Analyzer tmpUCVowelAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }
    };

    // Build the shared index: FIELD1/FIELD3 use the writer's baseAnalyzer,
    // FIELD2/FIELD4 get pre-analyzed token streams from the UC-vowel analyzers.
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(baseAnalyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
    String[] docs = new String[] { "abc_def", "lmnop", "abc one", "abc two", "qrs one", "qrs two", "tuv one",
            "tuv two", "qrs tuv", "qrs_tuv" };
    for (int i = 0; i < docs.length; i++) {
        Document doc = new Document();
        doc.add(newTextField(FIELD1, docs[i], Field.Store.YES));
        TextField tf = new TextField(FIELD2, docs[i], Field.Store.YES);
        tf.setTokenStream(ucVowelAnalyzer.tokenStream(FIELD2, docs[i]));
        doc.add(tf);
        doc.add(newTextField(FIELD3, docs[i], Field.Store.YES));
        TextField tf4 = new TextField(FIELD4, docs[i], Field.Store.YES);
        tf4.setTokenStream(tmpUCVowelAnalyzer.tokenStream(FIELD4, docs[i]));
        doc.add(tf4);
        writer.addDocument(doc);
    }
    // Open the reader/searcher before closing the writer, per the
    // RandomIndexWriter.getReader() usage pattern.
    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
}
From source file: practica2_1.Practica2_1.java
public static void main(String[] args) throws IOException, TikaException { Analyzer[] analizadores = { new WhitespaceAnalyzer(), new SimpleAnalyzer(), new StandardAnalyzer(), new EnglishAnalyzer(), new FrenchAnalyzer(), new FinnishAnalyzer(), CustomAnalyzer.builder(Paths.get("")).withTokenizer(StandardTokenizerFactory.class) .addTokenFilter(LowerCaseFilterFactory.class) .addTokenFilter(StopFilterFactory.class, "words", "C:\\Users\\Javi\\Desktop\\RI\\practica2\\stopwords.txt" /*, "ignoreCase", "false", "words", "stopwords.txt", "format", "wordset"*/) .build() };//from w w w. j a v a 2s . com File f = new File(args[0]); Tika tika = new Tika(); if (f.exists()) { File[] ficheros = f.listFiles(); for (int i = 0; i < ficheros.length; i++) { System.out.println(ficheros[i].getAbsolutePath()); } for (int i = 0; i < ficheros.length; i++) { File f2 = new File(ficheros[i].getAbsolutePath()); String text = tika.parseToString(f2); String language = identifyLanguage(text); List<String> result = new ArrayList<String>(); String name = ficheros[i].getAbsolutePath(); if (name.indexOf(".java") != -1) { result = tokenizeString(analizadores[6], text); process(result, ficheros[i].getAbsolutePath() + "_codeAnalyzer.txt"); } else if (name.indexOf(".java") == -1) for (int j = 0; j < analizadores.length - 1; j++) { List<String> result2 = new ArrayList<String>(); result2 = tokenizeString(analizadores[i], text); if (j == 0) process(result2, name + "_WhitespaceAnalyzer"); else if (j == 1) process(result2, name + "_SimpleAnalyzer"); else if (j == 2) process(result2, name + "_StandardAnalyzer"); else if (j == 3 && language.equals("en")) process(result2, name + "_englishAnalyzer"); else if (j == 4 && language.equals("fr")) process(result2, name + "_frenchAnalyzer"); else if (j == 5 && language.equals("fi")) process(result2, name + "_finnishAnalyzer"); } } } }