List of usage examples for the org.apache.lucene.analysis.hy.ArmenianAnalyzer constructor
public ArmenianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet)
From source file:nl.uva.p2psearch.Main.java
/**
 * Builds inverted-index entries from the description of a metadata entry: one pass
 * over stemmed single terms, then a second pass producing word n-grams (shingles of
 * size 2..maxNGrams) over the stemmed text.
 *
 * NOTE(review): a new InvertedIndexEntry is appended for EVERY token occurrence, with
 * a growing term frequency, so the returned list contains duplicates per term key.
 * That behavior is preserved here — confirm whether callers dedupe downstream.
 *
 * @param e the metadata entry whose description is tokenized and indexed
 * @return the inverted-index entries derived from the description
 * @throws IOException if token-stream processing fails
 */
private static List<InvertedIndexEntry> getInvertedIndexEntries(final MetadataEntry e) throws IOException {
    String text = e.getDescription();
    Map<Number160, Integer> dictionary = new HashMap<>();
    List<InvertedIndexEntry> list = new ArrayList<>();
    // Analyzer is Closeable — include it in try-with-resources (the original leaked it).
    try (Analyzer analyzer = new ArmenianAnalyzer(Version.LUCENE_42, Utils.getCharArrayStopwords());
            TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text))) {
        PorterStemFilter psf = new PorterStemFilter(tokenStream);
        CharTermAttribute term = psf.addAttribute(CharTermAttribute.class);
        psf.reset();
        StringBuilder sb = new StringBuilder();
        while (psf.incrementToken()) {
            Number160 termKey = Number160.createHash(term.toString());
            // merge() replaces the manual containsKey/get/increment dance.
            Integer tf = dictionary.merge(termKey, 1, Integer::sum);
            sb.append(term.toString()).append(" ");
            List<Number160> ll = new ArrayList<>();
            ll.add(Number160.createHash(e.getID()));
            list.add(new InvertedIndexEntry(termKey.toString(), term.toString(), tf, ll));
        }
        psf.end(); // TokenStream contract: end() after the consumption loop
        StandardTokenizer source = new StandardTokenizer(Version.LUCENE_42, new StringReader(sb.toString()));
        TokenStream tokenStreamSF = new StandardFilter(Version.LUCENE_42, source);
        try (ShingleFilter sf = new ShingleFilter(tokenStreamSF, 2, maxNGrams)) {
            sf.setOutputUnigrams(false);
            CharTermAttribute charTermAttribute = sf.addAttribute(CharTermAttribute.class);
            sf.reset();
            while (sf.incrementToken()) {
                String word = charTermAttribute.toString();
                // Shingle tokens contain spaces between words; store them joined with '_'.
                String ng = word.replaceAll(" ", "_");
                Number160 termKey = Number160.createHash(ng);
                Integer tf = dictionary.merge(termKey, 1, Integer::sum);
                List<Number160> ll = new ArrayList<>();
                ll.add(Number160.createHash(e.getID()));
                list.add(new InvertedIndexEntry(termKey.toString(), ng, tf, ll));
            }
            sf.end();
        }
    }
    return list;
}
From source file:nl.uva.sne.commons.SemanticUtils.java
/**
 * Returns an Armenian-analyzed token stream for the characters supplied by {@code reader}.
 *
 * NOTE(review): {@code fieldName} is ignored — the stream is always created for the
 * literal field name "field"; confirm whether that is intentional. The Analyzer is
 * deliberately not closed here, because closing it would invalidate the returned stream.
 *
 * @param fieldName unused (kept for signature compatibility)
 * @param reader the character source to tokenize
 * @return the analyzed token stream; the caller is responsible for consuming/closing it
 * @throws IOException if the stream cannot be created
 */
public static TokenStream tokenStream(String fieldName, Reader reader) throws IOException {
    final Analyzer armenian = new ArmenianAnalyzer(Version.LUCENE_42, getStopWords());
    return armenian.tokenStream("field", reader);
}
From source file:org.elasticsearch.analysis.common.ArmenianAnalyzerProvider.java
License:Apache License
/**
 * Builds the Armenian analyzer for an index: stop words are read from the index
 * settings (falling back to Lucene's default Armenian stop set) and an optional
 * stem-exclusion set is read from the settings (defaulting to empty).
 */
ArmenianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new ArmenianAnalyzer(
        Analysis.parseStopWords(env, settings, ArmenianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    // Pin the analyzer to the index's Lucene version for consistent token behavior.
    analyzer.setVersion(version);
}
From source file:org.omegat.tokenizer.LuceneArmenianTokenizer.java
License:Open Source License
@Override protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed) { if (stemsAllowed) { Set<?> stopWords = stopWordsAllowed ? ArmenianAnalyzer.getDefaultStopSet() : Collections.EMPTY_SET; return new ArmenianAnalyzer(getBehavior(), stopWords).tokenStream("", new StringReader(strOrig)); } else {// w w w. j a v a2 s .c om return new StandardTokenizer(getBehavior(), new StringReader(strOrig)); } }