Example usage for org.apache.lucene.analysis.en EnglishAnalyzer getDefaultStopSet

List of usage examples for org.apache.lucene.analysis.en EnglishAnalyzer getDefaultStopSet

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.en EnglishAnalyzer getDefaultStopSet.

Prototype

public static CharArraySet getDefaultStopSet() 

Source Link

Document

Returns an unmodifiable instance of the default stop words set.

Usage

From source file:com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java

License:Apache License

/**
 * Builds the token stream for the given text: the configured (or default English)
 * stop words feed an {@code EnglishSpecialAnalyzer}, optionally wrapped in a
 * shingle filter when n-gram emission is enabled.
 */
protected TokenStream createTokenStream(String text) {
    // Fall back to Lucene's default English stop set when none were configured.
    Set<?> effectiveStopWords;
    if (this.stopWords == null) {
        effectiveStopWords = EnglishAnalyzer.getDefaultStopSet();
    } else {
        effectiveStopWords = StopFilter.makeStopSet(LUCENE_VERSION, stopWords);
    }
    Analyzer analyzer = new EnglishSpecialAnalyzer(LUCENE_VERSION, effectiveStopWords, this.stemExclusionsSet);

    TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
    if (this.nGram) {
        // Emit word shingles between minNGram and maxNGram in size.
        stream = new ShingleFilter(stream, this.minNGram, this.maxNGram);
    }
    return stream;
}

From source file:com.stratio.cassandra.lucene.schema.analysis.SnowballAnalyzerBuilder.java

License:Apache License

/**
 * Returns the default stopwords set used by the Lucene language analyzer for the
 * specified language.
 *
 * @param language The language for which the stopwords are. The supported languages are English, French, Spanish,
 *                 Portuguese, Italian, Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish,
 *                 Irish, Hungarian, Turkish, Armenian, Basque and Catalan.
 * @return The default stopwords set used by the matching Lucene language analyzer,
 *         or {@code CharArraySet.EMPTY_SET} when the language is not supported.
 */
private static CharArraySet getDefaultStopwords(String language) {
    switch (language) {
    case "English":
        return EnglishAnalyzer.getDefaultStopSet();
    case "French":
        return FrenchAnalyzer.getDefaultStopSet();
    case "Spanish":
        return SpanishAnalyzer.getDefaultStopSet();
    case "Portuguese":
        return PortugueseAnalyzer.getDefaultStopSet();
    case "Italian":
        return ItalianAnalyzer.getDefaultStopSet();
    case "Romanian":
        return RomanianAnalyzer.getDefaultStopSet();
    case "German":
        return GermanAnalyzer.getDefaultStopSet();
    case "Dutch":
        return DutchAnalyzer.getDefaultStopSet();
    case "Swedish":
        return SwedishAnalyzer.getDefaultStopSet();
    case "Norwegian":
        return NorwegianAnalyzer.getDefaultStopSet();
    case "Danish":
        return DanishAnalyzer.getDefaultStopSet();
    case "Russian":
        return RussianAnalyzer.getDefaultStopSet();
    case "Finnish":
        return FinnishAnalyzer.getDefaultStopSet();
    case "Irish":
        return IrishAnalyzer.getDefaultStopSet();
    case "Hungarian":
        return HungarianAnalyzer.getDefaultStopSet();
    case "Turkish":
        // BUG FIX: previously returned the *Spanish* stop set for Turkish.
        // Fully qualified so the file's import section needs no change.
        return org.apache.lucene.analysis.tr.TurkishAnalyzer.getDefaultStopSet();
    case "Armenian":
        // BUG FIX: previously returned the *Spanish* stop set for Armenian.
        return org.apache.lucene.analysis.hy.ArmenianAnalyzer.getDefaultStopSet();
    case "Basque":
        return BasqueAnalyzer.getDefaultStopSet();
    case "Catalan":
        return CatalanAnalyzer.getDefaultStopSet();
    default:
        return CharArraySet.EMPTY_SET;
    }
}

From source file:com.stratio.cassandra.lucene.schema.analysis.StandardStopwordsTest.java

License:Apache License

@Test
public void testGetEnglishPreBuiltAnalyzer() {
    // The ENGLISH pre-built stopwords must equal Lucene's default English stop set.
    CharArraySet actual = StandardStopwords.ENGLISH.get();
    CharArraySet expected = EnglishAnalyzer.getDefaultStopSet();
    assertEquals("Expected another stopwords", expected, actual);
}

From source file:com.stratio.cassandra.lucene.schema.analysis.StandardStopwordsTest.java

License:Apache License

@Test
public void testGetStandardStopwordsFromNameLowerCase() {
    // A lower-case language name must resolve to the English default stop set.
    CharArraySet resolved = StandardStopwords.get("english");
    assertEquals("Expected not null stopwords", resolved, EnglishAnalyzer.getDefaultStopSet());
}

From source file:com.stratio.cassandra.lucene.schema.analysis.StandardStopwordsTest.java

License:Apache License

@Test
public void testGetStandardStopwordsFromNameUpperCase() {
    // A capitalized language name must resolve to the English default stop set too.
    CharArraySet resolved = StandardStopwords.get("English");
    assertEquals("Expected not null stopwords", resolved, EnglishAnalyzer.getDefaultStopSet());
}

From source file:edu.illinois.cs.cogcomp.bigdata.lucene.ASCIIEnglishAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Pipeline: standard tokenization -> standard filter -> ASCII folding ->
    // English possessive stripping -> word-delimiter splitting (alpha parts) ->
    // lower-casing -> English stop-word removal -> Porter stemming.
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new ASCIIFoldingFilter(chain);
    chain = new EnglishPossessiveFilter(chain);
    chain = new WordDelimiterFilter(chain, WordDelimiterFilter.ALPHA, null);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, EnglishAnalyzer.getDefaultStopSet());
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:ie.cmrc.smtx.lucene.analysis.EnglishKeywordAnalyzer.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // Pipeline: standard tokenization -> standard filter -> lower-casing ->
    // English stop-word removal -> KStem stemming -> ASCII folding -> concat.
    Tokenizer tokenizer = new StandardTokenizer(reader);
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, EnglishAnalyzer.getDefaultStopSet());
    // KStem is a lighter stemmer than Porter (the Porter alternative was
    // deliberately left disabled by the original author).
    chain = new KStemFilter(chain);
    chain = new ASCIIFoldingFilter(chain);
    chain = new ConcatFilter(chain);
    return new Analyzer.TokenStreamComponents(tokenizer, chain);
}

From source file:ie.cmrc.smtx.lucene.analysis.EuropeanAnalyzer.java

License:Apache License

/**
 * Gets the stop words set for the provided language
 * @param language Two-letter code of a language
 * @return {@code CharArraySet} containing the stop words of the provided language.
 * If the provided language is not supported,then the Lucene standard stop words set
 * if returned./*from  w  w w . j ava 2 s.  co m*/
 */
protected CharArraySet getStopWordsSet(String language) {
    String lang = language;
    if (lang != null)
        lang = lang.trim().toLowerCase();
    CharArraySet charArraySet = cache.get(lang);
    if (charArraySet == null) {
        if (SUPPORTED_LANGUAGES.contains(lang)) {
            if (lang.equals(LANG_EN)) {
                charArraySet = EnglishAnalyzer.getDefaultStopSet();
            } else if (lang.equals(LANG_FR)) {
                charArraySet = FrenchAnalyzer.getDefaultStopSet();
            } else if (lang.equals(LANG_ES)) {
                charArraySet = SpanishAnalyzer.getDefaultStopSet();
            } else if (lang.equals(LANG_PT)) {
                charArraySet = PortugueseAnalyzer.getDefaultStopSet();
            } else if (lang.equals(LANG_IT)) {
                charArraySet = ItalianAnalyzer.getDefaultStopSet();
            } else if (lang.equals(LANG_DE)) {
                charArraySet = GermanAnalyzer.getDefaultStopSet();
            } else if (lang.equals(LANG_NO)) {
                charArraySet = NorwegianAnalyzer.getDefaultStopSet();
            }
        } else {
            charArraySet = StandardAnalyzer.STOP_WORDS_SET;
        }
        cache.put(lang, charArraySet);
    }
    return charArraySet;
}

From source file:lab_mri.CustomAnalyzer.java

@Override
protected TokenStreamComponents createComponents(String string, Reader reader) {
    // Pipeline: Wikipedia-aware tokenization -> classic filter -> standard filter
    // -> English stop-word removal -> Porter stemming -> lower-casing.
    CharArraySet englishStopWords = EnglishAnalyzer.getDefaultStopSet();
    Tokenizer wikiTokenizer = new WikipediaTokenizer(reader);
    TokenStream chain = new ClassicFilter(wikiTokenizer);
    chain = new StandardFilter(chain);
    chain = new StopFilter(chain, englishStopWords);
    chain = new PorterStemFilter(chain);
    // NOTE(review): lower-casing runs AFTER stop filtering, so capitalized stop
    // words (e.g. "The") survive removal — preserved as the original behaved.
    chain = new LowerCaseFilter(chain);
    return new TokenStreamComponents(wikiTokenizer, chain);
}

From source file:org.crypto.sse.TextExtractPar.java

License:Open Source License

/**
 * Reads every file in {@code listOfFile}, extracts its textual content according
 * to its extension (docx / pptx / xlsx / doc / pdf / media / raw text), tokenizes
 * it with a standard analyzer using Lucene's default English stop set, and builds
 * two inverted lookups: word -> files (lookup1) and file -> words (lookup2), with
 * each (word, file) pair recorded at most once.
 *
 * @param listOfFile the files to index
 * @return a {@code TextExtractPar} wrapping both lookups
 * @throws FileNotFoundException if a file cannot be opened for reading
 */
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException {

    Multimap<String, String> lookup1 = ArrayListMultimap.create();
    Multimap<String, String> lookup2 = ArrayListMultimap.create();

    // PERF FIX: the stop set and analyzer are reusable; the original rebuilt both
    // once per *line* inside the inner loop. Tokenization behavior is unchanged.
    CharArraySet noise = EnglishAnalyzer.getDefaultStopSet();
    Analyzer analyzer = new StandardAnalyzer(noise);

    for (File file : listOfFile) {
        printProgress(listOfFile.length);
        counter++;
        List<String> lines = extractLines(file);
        indexLines(file, lines, analyzer, lookup1, lookup2);
    }

    return new TextExtractPar(lookup1, lookup2);
}

/** Prints a coarse percentage progress message based on the global file counter. */
private static void printProgress(int totalFiles) {
    for (int j = 0; j < 100; j++) {
        if (counter == (int) ((j + 1) * totalFiles / 100)) {
            System.out.println("Number of files read equals " + j + " %");
            break;
        }
    }
}

/**
 * Extracts the textual content of {@code file} based on its extension. Extraction
 * failures are logged and produce an empty (or partial) result instead of aborting
 * the whole run. The input stream is always closed before returning.
 *
 * @throws FileNotFoundException if the file cannot be opened
 */
private static List<String> extractLines(File file) throws FileNotFoundException {
    List<String> lines = new ArrayList<String>();
    String name = file.getName();
    FileInputStream fis = new FileInputStream(file);
    try {
        if (name.endsWith(".docx")) {
            try {
                XWPFDocument doc = new XWPFDocument(fis);
                XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
                lines.add(extractor.getText());
            } catch (IOException e) {
                System.out.println("File not read: " + file.getName());
            }
        } else if (name.endsWith(".pptx")) {
            try {
                OPCPackage ppt = OPCPackage.open(fis);
                XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(ppt);
                lines.add(extractor.getText());
            } catch (XmlException | IOException | OpenXML4JException e) {
                System.out.println("File not read: " + file.getName());
            }
        } else if (name.endsWith(".xlsx")) {
            try {
                OPCPackage xls = OPCPackage.open(fis);
                XSSFExcelExtractor extractor = new XSSFExcelExtractor(xls);
                lines.add(extractor.getText());
            } catch (XmlException | IOException | OpenXML4JException e) {
                // InvalidFormatException is a subclass of OpenXML4JException and
                // is therefore covered by this multi-catch.
                System.out.println("File not read: " + file.getName());
            }
        } else if (name.endsWith(".doc")) {
            try {
                NPOIFSFileSystem fs = new NPOIFSFileSystem(file);
                WordExtractor extractor = new WordExtractor(fs.getRoot());
                for (String rawText : extractor.getParagraphText()) {
                    lines.add(extractor.stripFields(rawText));
                }
            } catch (IOException e) {
                System.out.println("File not read: " + file.getName());
            }
        } else if (name.endsWith(".pdf")) {
            try {
                PDFParser parser = new PDFParser(fis);
                parser.parse();
                COSDocument cd = parser.getDocument();
                PDFTextStripper stripper = new PDFTextStripper();
                lines.add(stripper.getText(new PDDocument(cd)));
            } catch (IOException e) {
                System.out.println("File not read: " + file.getName());
            }
        } else if (name.endsWith(".gif") || name.endsWith(".jpeg") || name.endsWith(".wmv")
                || name.endsWith(".mpeg") || name.endsWith(".mp4")) {
            // BUG FIX: the original chained these checks with '&&', which can
            // never all be true, so media files silently fell through to the
            // raw-text reader. Media files are indexed by file name only.
            lines.add(file.getName());
        } else {
            // Any other extension is treated as raw UTF-8 text.
            try {
                lines = Files.readLines(file, Charsets.UTF_8);
            } catch (IOException e) {
                System.out.println("File not read: " + file.getName());
            }
        }
    } finally {
        // RESOURCE FIX: the original closed the stream only on the raw-text
        // path, leaking one descriptor per Office/PDF/media file.
        try {
            fis.close();
        } catch (IOException ignored) {
            // best-effort close; nothing useful to do here
        }
    }
    return lines;
}

/**
 * Tokenizes each line and records every (word, file) pair at most once in both
 * directions: lookup1 maps word -> files, lookup2 maps file -> words.
 * (The original's unused temporaryCounter/counterDoc locals were removed.)
 */
private static void indexLines(File file, List<String> lines, Analyzer analyzer,
        Multimap<String, String> lookup1, Multimap<String, String> lookup2) {
    String fileName = file.getName();
    for (int i = 0; i < lines.size(); i++) {
        List<String> token = Tokenizer.tokenizeString(analyzer, lines.get(i));
        for (int j = 0; j < token.size(); j++) {
            // Avoid counting occurrences of words in the same file
            if (!lookup2.get(fileName).contains(token.get(j))) {
                lookup2.put(fileName, token.get(j));
            }
            // Avoid counting occurrences of files for the same word
            if (!lookup1.get(token.get(j)).contains(fileName)) {
                lookup1.put(token.get(j), fileName);
            }
        }
    }
}