List of usage examples for org.apache.lucene.analysis.en EnglishAnalyzer getDefaultStopSet
public static CharArraySet getDefaultStopSet()
From source file:com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java
License:Apache License
protected TokenStream createTokenStream(String text) { Set<?> luceneStopWords = this.stopWords == null ? EnglishAnalyzer.getDefaultStopSet() : StopFilter.makeStopSet(LUCENE_VERSION, stopWords); Analyzer analyzer = new EnglishSpecialAnalyzer(LUCENE_VERSION, luceneStopWords, this.stemExclusionsSet); TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text)); if (this.nGram) { tokenStream = new ShingleFilter(tokenStream, this.minNGram, this.maxNGram); }// ww w . j av a 2s . c om return tokenStream; }
From source file:com.stratio.cassandra.lucene.schema.analysis.SnowballAnalyzerBuilder.java
License:Apache License
/**
 * Returns the default stopwords set used by the Lucene language analyzer for the
 * specified language.
 *
 * @param language the language for which the stopwords are. The supported languages are
 *                 English, French, Spanish, Portuguese, Italian, Romanian, German, Dutch,
 *                 Swedish, Norwegian, Danish, Russian, Finnish, Irish, Hungarian, Turkish,
 *                 Armenian, Basque and Catalan.
 * @return the default stopwords set used by the Lucene language analyzer, or
 *         {@link CharArraySet#EMPTY_SET} when the language is not recognised
 */
private static CharArraySet getDefaultStopwords(String language) {
    switch (language) {
    case "English":
        return EnglishAnalyzer.getDefaultStopSet();
    case "French":
        return FrenchAnalyzer.getDefaultStopSet();
    case "Spanish":
        return SpanishAnalyzer.getDefaultStopSet();
    case "Portuguese":
        return PortugueseAnalyzer.getDefaultStopSet();
    case "Italian":
        return ItalianAnalyzer.getDefaultStopSet();
    case "Romanian":
        return RomanianAnalyzer.getDefaultStopSet();
    case "German":
        return GermanAnalyzer.getDefaultStopSet();
    case "Dutch":
        return DutchAnalyzer.getDefaultStopSet();
    case "Swedish":
        return SwedishAnalyzer.getDefaultStopSet();
    case "Norwegian":
        return NorwegianAnalyzer.getDefaultStopSet();
    case "Danish":
        return DanishAnalyzer.getDefaultStopSet();
    case "Russian":
        return RussianAnalyzer.getDefaultStopSet();
    case "Finnish":
        return FinnishAnalyzer.getDefaultStopSet();
    case "Irish":
        return IrishAnalyzer.getDefaultStopSet();
    case "Hungarian":
        return HungarianAnalyzer.getDefaultStopSet();
    // BUG FIX: Turkish and Armenian previously returned the SPANISH stop-word set,
    // silently applying the wrong language's stopwords. Use the dedicated
    // org.apache.lucene.analysis.tr.TurkishAnalyzer and
    // org.apache.lucene.analysis.hy.ArmenianAnalyzer defaults instead.
    case "Turkish":
        return TurkishAnalyzer.getDefaultStopSet();
    case "Armenian":
        return ArmenianAnalyzer.getDefaultStopSet();
    case "Basque":
        return BasqueAnalyzer.getDefaultStopSet();
    case "Catalan":
        return CatalanAnalyzer.getDefaultStopSet();
    default:
        return CharArraySet.EMPTY_SET;
    }
}
From source file:com.stratio.cassandra.lucene.schema.analysis.StandardStopwordsTest.java
License:Apache License
/** Checks that the pre-built English stopword set equals Lucene's default English stop set. */
@Test
public void testGetEnglishPreBuiltAnalyzer() {
    CharArraySet actual = StandardStopwords.ENGLISH.get();
    CharArraySet expected = EnglishAnalyzer.getDefaultStopSet();
    assertEquals("Expected another stopwords", expected, actual);
}
From source file:com.stratio.cassandra.lucene.schema.analysis.StandardStopwordsTest.java
License:Apache License
/** Verifies that lookup by lower-case name ("english") yields Lucene's default English stop set. */
@Test
public void testGetStandardStopwordsFromNameLowerCase() {
    CharArraySet stopwords = StandardStopwords.get("english");
    // FIX: JUnit assertEquals takes (message, expected, actual); the original passed the
    // actual value in the expected position, producing a misleading failure message.
    assertEquals("Expected not null stopwords", EnglishAnalyzer.getDefaultStopSet(), stopwords);
}
From source file:com.stratio.cassandra.lucene.schema.analysis.StandardStopwordsTest.java
License:Apache License
/** Verifies that lookup by capitalized name ("English") yields Lucene's default English stop set. */
@Test
public void testGetStandardStopwordsFromNameUpperCase() {
    CharArraySet stopwords = StandardStopwords.get("English");
    // FIX: JUnit assertEquals takes (message, expected, actual); the original passed the
    // actual value in the expected position, producing a misleading failure message.
    assertEquals("Expected not null stopwords", EnglishAnalyzer.getDefaultStopSet(), stopwords);
}
From source file:edu.illinois.cs.cogcomp.bigdata.lucene.ASCIIEnglishAnalyzer.java
License:Open Source License
/**
 * Builds the analysis chain for a field: standard tokenization, ASCII folding,
 * English possessive stripping, word-delimiter splitting (keeping ALPHA parts),
 * lower-casing, English stop-word removal, and Porter stemming.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream stream = new StandardFilter(tokenizer);
    stream = new ASCIIFoldingFilter(stream);       // fold diacritics to plain ASCII
    stream = new EnglishPossessiveFilter(stream);  // strip trailing 's
    stream = new WordDelimiterFilter(stream, WordDelimiterFilter.ALPHA, null);
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(stream, EnglishAnalyzer.getDefaultStopSet());
    stream = new PorterStemFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}
From source file:ie.cmrc.smtx.lucene.analysis.EnglishKeywordAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer source = new StandardTokenizer(reader); TokenStream filter = new StandardFilter(source); filter = new LowerCaseFilter(filter); filter = new StopFilter(filter, EnglishAnalyzer.getDefaultStopSet()); filter = new KStemFilter(filter); //filter = new PorterStemFilter(filter); filter = new ASCIIFoldingFilter(filter); filter = new ConcatFilter(filter); return new Analyzer.TokenStreamComponents(source, filter); }
From source file:ie.cmrc.smtx.lucene.analysis.EuropeanAnalyzer.java
License:Apache License
/** * Gets the stop words set for the provided language * @param language Two-letter code of a language * @return {@code CharArraySet} containing the stop words of the provided language. * If the provided language is not supported,then the Lucene standard stop words set * if returned./*from w w w . j ava 2 s. co m*/ */ protected CharArraySet getStopWordsSet(String language) { String lang = language; if (lang != null) lang = lang.trim().toLowerCase(); CharArraySet charArraySet = cache.get(lang); if (charArraySet == null) { if (SUPPORTED_LANGUAGES.contains(lang)) { if (lang.equals(LANG_EN)) { charArraySet = EnglishAnalyzer.getDefaultStopSet(); } else if (lang.equals(LANG_FR)) { charArraySet = FrenchAnalyzer.getDefaultStopSet(); } else if (lang.equals(LANG_ES)) { charArraySet = SpanishAnalyzer.getDefaultStopSet(); } else if (lang.equals(LANG_PT)) { charArraySet = PortugueseAnalyzer.getDefaultStopSet(); } else if (lang.equals(LANG_IT)) { charArraySet = ItalianAnalyzer.getDefaultStopSet(); } else if (lang.equals(LANG_DE)) { charArraySet = GermanAnalyzer.getDefaultStopSet(); } else if (lang.equals(LANG_NO)) { charArraySet = NorwegianAnalyzer.getDefaultStopSet(); } } else { charArraySet = StandardAnalyzer.STOP_WORDS_SET; } cache.put(lang, charArraySet); } return charArraySet; }
From source file:lab_mri.CustomAnalyzer.java
/**
 * Builds a Wikipedia-aware analysis chain: Wikipedia tokenization, classic token
 * cleanup, lower-casing, English stop-word removal, and Porter stemming.
 *
 * FIX: the original chain applied LowerCaseFilter AFTER StopFilter and
 * PorterStemFilter. Lucene's default English stop set contains lower-case entries
 * and the Porter stemmer assumes lower-cased input, so capitalized stop words
 * (e.g. "The") were never removed and capitalized terms were mis-stemmed.
 * Lower-casing now runs before stop-word removal and stemming.
 */
@Override
protected TokenStreamComponents createComponents(String string, Reader reader) {
    CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    Tokenizer tokenizer = new WikipediaTokenizer(reader);
    TokenStream filter = new ClassicFilter(tokenizer);
    filter = new StandardFilter(filter);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(filter, stopWords);
    filter = new PorterStemFilter(filter);
    return new TokenStreamComponents(tokenizer, filter);
}
From source file:org.crypto.sse.TextExtractPar.java
License:Open Source License
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException { Multimap<String, String> lookup1 = ArrayListMultimap.create(); Multimap<String, String> lookup2 = ArrayListMultimap.create(); for (File file : listOfFile) { for (int j = 0; j < 100; j++) { if (counter == (int) ((j + 1) * listOfFile.length / 100)) { System.out.println("Number of files read equals " + j + " %"); break; }//ww w . ja v a2 s .c o m } List<String> lines = new ArrayList<String>(); counter++; FileInputStream fis = new FileInputStream(file); // ***********************************************************************************************// ///////////////////// .docx ///////////////////////////// // ***********************************************************************************************// if (file.getName().endsWith(".docx")) { XWPFDocument doc; try { // System.out.println("File read: "+file.getName()); doc = new XWPFDocument(fis); XWPFWordExtractor ex = new XWPFWordExtractor(doc); lines.add(ex.getText()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pptx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pptx")) { OPCPackage ppt; try { // System.out.println("File read: "+file.getName()); ppt = OPCPackage.open(fis); XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(ppt); lines.add(xw.getText()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // 
***********************************************************************************************// ///////////////////// .xlsx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".xlsx")) { OPCPackage xls; try { // System.out.println("File read: "+file.getName()); xls = OPCPackage.open(fis); XSSFExcelExtractor xe = new XSSFExcelExtractor(xls); lines.add(xe.getText()); } catch (InvalidFormatException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { System.out.println("File not read: " + file.getName()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .doc ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".doc")) { NPOIFSFileSystem fs; try { // System.out.println("File read: "+file.getName()); fs = new NPOIFSFileSystem(file); WordExtractor extractor = new WordExtractor(fs.getRoot()); for (String rawText : extractor.getParagraphText()) { lines.add(extractor.stripFields(rawText)); } } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pdf ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pdf")) { PDFParser parser; try { // System.out.println("File read: "+file.getName()); parser = new 
PDFParser(fis); parser.parse(); COSDocument cd = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); lines.add(stripper.getText(new PDDocument(cd))); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// Media Files such as gif, jpeg, .wmv, .mpeg, ///////////////////// .mp4 ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".gif") && file.getName().endsWith(".jpeg") && file.getName().endsWith(".wmv") && file.getName().endsWith(".mpeg") && file.getName().endsWith(".mp4")) { lines.add(file.getName()); } // ***********************************************************************************************// ///////////////////// raw text extensions ///////////////////// ///////////////////////////// // ***********************************************************************************************// else { try { // System.out.println("File read: "+file.getName()); lines = Files.readLines(file, Charsets.UTF_8); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } finally { try { fis.close(); } catch (IOException ioex) { // omitted. } } } // ***********************************************************************************************// ///////////////////// Begin word extraction ///////////////////// ///////////////////////////// // ***********************************************************************************************// int temporaryCounter = 0; // Filter threshold int counterDoc = 0; for (int i = 0; i < lines.size(); i++) { CharArraySet noise = EnglishAnalyzer.getDefaultStopSet(); // We are using a standard tokenizer that eliminates the stop // words. 
We can use Stemming tokenizer such Porter // A set of English noise keywords is used that will eliminates // words such as "the, a, etc" Analyzer analyzer = new StandardAnalyzer(noise); List<String> token = Tokenizer.tokenizeString(analyzer, lines.get(i)); temporaryCounter = temporaryCounter + token.size(); for (int j = 0; j < token.size(); j++) { // Avoid counting occurrences of words in the same file if (!lookup2.get(file.getName()).contains(token.get(j))) { lookup2.put(file.getName(), token.get(j)); } // Avoid counting occurrences of words in the same file if (!lookup1.get(token.get(j)).contains(file.getName())) { lookup1.put(token.get(j), file.getName()); } } } } // System.out.println(lookup.toString()); return new TextExtractPar(lookup1, lookup2); }