Example usage for org.apache.lucene.analysis.core StopAnalyzer StopAnalyzer

List of usage examples for org.apache.lucene.analysis.core StopAnalyzer StopAnalyzer

Introduction

On this page you can find example usage for org.apache.lucene.analysis.core StopAnalyzer StopAnalyzer.

Prototype

public StopAnalyzer(Reader stopwords) throws IOException 

Source Link

Document

Builds an analyzer with the stop words from the given reader.

Usage

From source file:br.edu.utfpr.cm.JGitMinerWeb.services.matrix.auxiliary.LuceneUtil.java

/**
 * Tokenizes the given text with Lucene's StopAnalyzer (which lower-cases on
 * letter boundaries and drops English stop words) and returns the tokens.
 *
 * @param linha the text to tokenize
 * @return the tokens produced by the analyzer; empty if the input yields no
 *         tokens or an I/O error occurs (the error is logged, not rethrown)
 */
public static List<String> tokenizeString(String linha) {
    List<String> result = new ArrayList<>();
    // try-with-resources closes both the analyzer and the token stream,
    // fixing the resource leak in the original (neither was ever closed).
    try (Analyzer analyzer = new StopAnalyzer(Version.LUCENE_46);
            TokenStream stream = analyzer.tokenStream(null, new StringReader(linha))) {
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end(); // TokenStream contract: call end() after consuming all tokens
    } catch (IOException e) {
        // Best-effort behavior preserved from the original: log and return
        // whatever was collected before the failure.
        System.out.println(e.getMessage());
    }
    return result;
}

From source file:com.tuplejump.stargate.lucene.AnalyzerFactory.java

License:Apache License

/**
 * Resolves an {@link Analyzer} instance by name for the given Lucene version.
 *
 * @param analyzerName name of an {@code Analyzers} enum constant
 * @param luceneV      Lucene version to construct the analyzer with
 * @return the matching analyzer; a {@code StandardAnalyzer} for any
 *         recognized-but-unmapped constant; or {@code null} when
 *         {@code analyzerName} is not a known {@code Analyzers} constant.
 *         Callers must handle the {@code null} return.
 */
public static Analyzer getAnalyzer(String analyzerName, Version luceneV) {
    try {
        switch (Analyzers.valueOf(analyzerName)) {
        case SimpleAnalyzer:
            return new SimpleAnalyzer(luceneV);
        case StandardAnalyzer:
            return new StandardAnalyzer(luceneV);
        case StopAnalyzer:
            return new StopAnalyzer(luceneV);
        case WhitespaceAnalyzer:
            return new WhitespaceAnalyzer(luceneV);
        case KeywordAnalyzer:
            return new CaseInsensitiveKeywordAnalyzer(luceneV);
        default:
            return new StandardAnalyzer(luceneV);
        }
    } catch (IllegalArgumentException ignored) {
        // valueOf throws for unknown names. The original swallowed this in an
        // empty catch; the null return is kept for backward compatibility, but
        // the swallow is now explicit and documented.
        return null;
    }
}

From source file:eu.eexcess.federatedrecommender.decomposer.PseudoRelevanceSourcesDecomposer.java

License:Open Source License

/**
 * Expands the profile's context keywords with high-frequency terms harvested
 * from the results the query-expansion source partners return.
 *
 * Steps: (1) query the expansion-source partners (temporarily swapping the
 * profile's partner list), (2) index every result title and description into
 * an in-memory Lucene index, (3) pull the top-20 high-frequency terms from the
 * "content" field and add the new ones as EXPANSION keywords.
 *
 * @param inputSecureUserProfile profile to expand; modified in place
 * @return the same profile instance, possibly with added EXPANSION keywords
 */
@Override
public SecureUserProfile decompose(SecureUserProfileEvaluation inputSecureUserProfile) {
    FederatedRecommenderCore fCore = null;

    try {
        fCore = FederatedRecommenderCore.getInstance(null);
    } catch (FederatedRecommenderException e) {
        logger.log(Level.SEVERE, "Error getting FederatedRecommenderCore,was perhabs not initialized correctly",
                e);
    }
    // Robustness fix: the original dereferenced fCore after logging the
    // failure, which guaranteed an NPE. Return the profile unchanged instead.
    if (fCore == null) {
        return inputSecureUserProfile;
    }

    Set<String> keywords = new HashSet<String>();
    for (ContextKeyword cKeyword : inputSecureUserProfile.contextKeywords) {
        keywords.add(cKeyword.text);
    }

    // Temporarily swap in the expansion-source partners, query them, then
    // restore the original partner list.
    List<PartnerBadge> tmpPartnerList = new ArrayList<PartnerBadge>(inputSecureUserProfile.partnerList);
    inputSecureUserProfile.partnerList = inputSecureUserProfile.queryExpansionSourcePartner;
    PartnersFederatedRecommendations pFR = fCore.getPartnersRecommendations(inputSecureUserProfile);
    inputSecureUserProfile.partnerList = tmpPartnerList;

    Directory directory = new RAMDirectory();

    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_48);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);

    try {
        // try-with-resources closes (and thereby commits) the writer even if
        // addDoc throws; the original leaked the writer on any failure.
        try (IndexWriter writer = new IndexWriter(directory, config)) {
            for (ResultList resultLists : pFR.getResults().values()) {
                for (Result result : resultLists.results) {
                    addDoc(writer, result.description);
                    addDoc(writer, result.title);
                }
            }
        }

        IndexReader reader = DirectoryReader.open(directory);
        TermStats[] tStats = null;
        try {
            tStats = HighFreqTerms.getHighFreqTerms(reader, 20, "content", new DocFreqComparator());
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Could not open HighFreqTerms", e);
        } finally {
            reader.close();
        }
        if (tStats != null) {
            for (TermStats termStats : tStats) {
                String utf8String = termStats.termtext.utf8ToString();
                // Keep only terms longer than 4 chars that are neither already
                // covered by the query check nor already in the keyword set.
                if (utf8String.length() > 4
                        && !checkHighFreqTermsQuery(utf8String.toLowerCase(), keywords)
                        && keywords.add(utf8String.toLowerCase())) {
                    // docFreq/100.0 is used as the keyword weight — original
                    // heuristic, preserved as-is.
                    inputSecureUserProfile.contextKeywords.add(new ContextKeyword(utf8String,
                            termStats.docFreq / 100.0, ExpansionType.EXPANSION));
                }
            }
        } else {
            logger.log(Level.SEVERE, "TermStats was null!");
        }
    } catch (IOException e) {
        logger.log(Level.SEVERE, "There was and error writing/reading the Index", e);
    }

    logger.log(Level.INFO, "Source   Expansion: " + keywords.toString() + " Partners: "
            + inputSecureUserProfile.queryExpansionSourcePartner);
    return inputSecureUserProfile;
}

From source file:lia.analysis.CopyOfAnalyzerDemo.java

License:Apache License

/**
 * Builds the analyzer lineup and runs each sample text from {@code examples}
 * through the {@code analyze} helper (both defined elsewhere in this class).
 */
public static void main(String[] args) throws IOException {
    analyzers = new Analyzer[] {
            new WhitespaceAnalyzer(Version.LUCENE_46),
            new SimpleAnalyzer(Version.LUCENE_46),
            new StopAnalyzer(Version.LUCENE_46),
            new StandardAnalyzer(Version.LUCENE_46),
            new KeywordAnalyzer(),
            new KoreanAnalyzer(Version.LUCENE_46) };

    for (String sample : examples) {
        analyze(sample);
    }
}

From source file:nl.cwi.helpers.NGramExtractor.java

License:Open Source License

/**
 * Extracts n-grams from a String of text.
 * Can handle n-grams of any length and can also perform stop-word removal
 * before extraction. Populates {@code nGrams}, {@code uniqueNGrams} and
 * {@code nGramFreqs} as a side effect.
 *
 * @param text      the text that the n-grams should be extracted from
 * @param minLength the minimum length of the n-grams
 * @param maxLength the maximum length of the n-grams
 * @param stopWords whether or not stop words should be removed before extraction
 * @throws FileNotFoundException kept for interface compatibility
 * @throws IOException           if the token stream fails while reading
 */
public void extract(String text, int minLength, int maxLength, Boolean stopWords)
        throws FileNotFoundException, IOException {

    this.text = text;
    this.minLength = minLength;
    this.maxLength = maxLength;
    this.stopWords = stopWords;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /* If minLength and maxLength are both 1 we want unigrams:
     * StopAnalyzer when stop words should be removed, SimpleAnalyzer when
     * they should be kept.
     */
    if ((minLength == 1) && (maxLength == 1)) {
        if (this.stopWords) {
            analyzer = new StopAnalyzer(Version.LUCENE_43);
        } else {
            analyzer = new SimpleAnalyzer(Version.LUCENE_43);
        }
    } else {
        // Longer than unigrams: wrap in a ShingleAnalyzerWrapper, again
        // choosing the inner analyzer by the stop-word flag.
        // NOTE(review): the original comment described this as a "Lucene 2.4"
        // hack to avoid underscore placeholders for removed stop words, yet the
        // code passes LUCENE_42 (inconsistent with LUCENE_43 above) — confirm
        // which version the position-increment workaround actually requires.
        if (this.stopWords) {
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_42), minLength, maxLength,
                    " ", false, false);
        } else {
            analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(Version.LUCENE_42), minLength, maxLength,
                    " ", false, false);
        }
    }

    // Tokenize the text. try-with-resources closes the stream, fixing the
    // leak in the original (the stream was never end()ed or closed). The
    // unused OffsetAttribute and token counter were removed.
    try (TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text))) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            nGrams.add(charTermAttribute.toString()); // collect every n-gram in order
        }
        tokenStream.end(); // TokenStream contract: end() after the last token
    }

    // Tally frequencies; record each n-gram's first occurrence in uniqueNGrams.
    for (String nGram : nGrams) {
        Integer freq = nGramFreqs.get(nGram);
        if (freq == null) {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        } else {
            nGramFreqs.put(nGram, freq + 1);
        }
    }
}