Example usage for org.apache.lucene.analysis.hy ArmenianAnalyzer ArmenianAnalyzer

List of usage examples for org.apache.lucene.analysis.hy ArmenianAnalyzer ArmenianAnalyzer

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.hy ArmenianAnalyzer ArmenianAnalyzer.

Prototype

public ArmenianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) 

Source Link

Document

Builds an analyzer with the given stop words and stem exclusion set.

Usage

From source file:nl.uva.p2psearch.Main.java

/**
 * Tokenizes the description text of the given metadata entry and builds
 * inverted-index entries for (1) single terms, Porter-stemmed after passing
 * through an ArmenianAnalyzer, and (2) word n-grams (shingles of size 2 up to
 * {@code maxNGrams}, with spaces replaced by underscores) produced from the
 * stemmed term sequence.
 *
 * @param e the metadata entry whose description is indexed; its ID is hashed
 *          into each entry's document list
 * @return one InvertedIndexEntry per emitted token/n-gram occurrence
 * @throws IOException if a token stream cannot be read
 */
private static List<InvertedIndexEntry> getInvertedIndexEntries(final MetadataEntry e) throws IOException {
    String text = e.getDescription();
    // Term-frequency map keyed by the hash of the (stemmed) term or n-gram.
    Map<Number160, Integer> dictionary = new HashMap<>();
    List<InvertedIndexEntry> list = new ArrayList<>();

    Analyzer analyzer = new ArmenianAnalyzer(Version.LUCENE_42, Utils.getCharArrayStopwords());
    try (TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text))) {
        // Porter stemming is applied on top of the analyzer's own pipeline.
        PorterStemFilter psf = new PorterStemFilter(tokenStream);
        CharTermAttribute term = psf.addAttribute(CharTermAttribute.class);
        psf.reset();
        // Collects the stemmed terms so the shingle pass below runs over the
        // same (stemmed) token sequence rather than the raw text.
        StringBuilder sb = new StringBuilder();
        while (psf.incrementToken()) {
            Integer tf;
            Number160 termKey = Number160.createHash(term.toString());
            if (dictionary.containsKey(termKey)) {
                tf = dictionary.get(termKey);
                tf++;
            } else {
                tf = 1;
            }
            dictionary.put(termKey, tf);
            sb.append(term.toString()).append(" ");
            List<Number160> ll = new ArrayList<>();
            ll.add(Number160.createHash(e.getID()));
            // NOTE(review): a NEW entry is appended for every occurrence of a
            // repeated term, each carrying the running tf at that point —
            // presumably duplicates are merged downstream; verify with callers.
            list.add(new InvertedIndexEntry(termKey.toString(), term.toString(), tf, ll));
        }
        // Second pass: shingle (n-gram) extraction over the stemmed terms.
        StandardTokenizer source = new StandardTokenizer(Version.LUCENE_42, new StringReader(sb.toString()));
        TokenStream tokenStreamSF = new StandardFilter(Version.LUCENE_42, source);
        try (ShingleFilter sf = new ShingleFilter(tokenStreamSF, 2, maxNGrams)) {
            sf.setOutputUnigrams(false); // unigrams already indexed above
            CharTermAttribute charTermAttribute = sf.addAttribute(CharTermAttribute.class);
            sf.reset();
            while (sf.incrementToken()) {
                String word = charTermAttribute.toString();

                // Store n-grams as single underscore-joined tokens.
                String ng = word.replaceAll(" ", "_");
                Integer tf;
                Number160 termKey = Number160.createHash(ng);
                if (dictionary.containsKey(termKey)) {
                    tf = dictionary.get(termKey);
                    tf++;
                } else {
                    tf = 1;
                }
                dictionary.put(termKey, tf);
                List<Number160> ll = new ArrayList<>();
                ll.add(Number160.createHash(e.getID()));
                list.add(new InvertedIndexEntry(termKey.toString(), ng, tf, ll));
            }
        }
        // NOTE(review): neither stream calls end() before close(), and the
        // Analyzer itself is never closed — confirm this is acceptable here.
    }
    return list;
}

From source file:nl.uva.sne.commons.SemanticUtils.java

/**
 * Creates a token stream over the given reader using an {@code ArmenianAnalyzer}
 * configured with the project stop-word set.
 *
 * @param fieldName the Lucene field name the stream is created for
 * @param reader    source of the text to tokenize
 * @return the analyzer's token stream for the given field and reader
 * @throws IOException if the analyzer cannot create the stream
 */
public static TokenStream tokenStream(String fieldName, Reader reader) throws IOException {
    Analyzer analyzer = new ArmenianAnalyzer(Version.LUCENE_42, getStopWords());
    // Fix: honor the fieldName parameter instead of the hard-coded "field",
    // which silently ignored what the caller asked for.
    // NOTE(review): the analyzer is deliberately not closed here — the returned
    // stream is backed by it; the caller owns the stream's lifecycle.
    return analyzer.tokenStream(fieldName, reader);
}

From source file:org.elasticsearch.analysis.common.ArmenianAnalyzerProvider.java

License:Apache License

ArmenianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new ArmenianAnalyzer(
            Analysis.parseStopWords(env, settings, ArmenianAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);/*w  ww . ja va  2  s . co  m*/
}

From source file:org.omegat.tokenizer.LuceneArmenianTokenizer.java

License:Open Source License

/**
 * Returns a token stream over {@code strOrig}: when stemming is allowed, an
 * {@code ArmenianAnalyzer} stream (with the default Armenian stop-word set if
 * stop-word removal is enabled); otherwise a plain {@code StandardTokenizer}.
 */
@Override
protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed,
        final boolean stopWordsAllowed) {
    final Reader input = new StringReader(strOrig);
    if (!stemsAllowed) {
        // No stemming requested: raw standard tokenization, no stop-word filtering.
        return new StandardTokenizer(getBehavior(), input);
    }
    final Set<?> stopWords = stopWordsAllowed ? ArmenianAnalyzer.getDefaultStopSet() : Collections.EMPTY_SET;
    return new ArmenianAnalyzer(getBehavior(), stopWords).tokenStream("", input);
}