List of usage examples for the org.apache.lucene.analysis.shingle.ShingleFilter constructor
public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize)
Parameter: input — the upstream TokenStream to build shingles from.

From source file: be.ugent.tiwi.sleroux.newsrec.recommendationstester.EnAnalyzer.java
License:Apache License
/** * * @param fieldName//from ww w .jav a 2 s . co m * @param reader * @return */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { //reader = new HTMLStripCharFilter(reader); Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader); TokenStream result = t; //result = new SynonymFilter(result, synonyms, true); result = new StandardFilter(Config.LUCENE_VERSION, result); result = new LowerCaseFilter(Config.LUCENE_VERSION, result); result = new TrimFilter(Config.LUCENE_VERSION, result); result = new ASCIIFoldingFilter(result); if (stopwords != null) { result = new StopFilter(Config.LUCENE_VERSION, result, stopwords); } else { logger.warn("No stopwordsfile provided, no stopword removal"); } //result = new LowerCaseFilter(Version.LUCENE_46, result); result = new EnglishPossessiveFilter(Config.LUCENE_VERSION, result); //result = new PorterStemFilter(result); result = new SnowballFilter(result, new EnglishStemmer()); ShingleFilter sf = new ShingleFilter(result, 2, 3); sf.setFillerToken(null); return new TokenStreamComponents(t, sf); }
From source file:ci6226.NGramAnalyzer.java
@Override protected TokenStreamComponents createComponents(String arg0, Reader reader) { Tokenizer source = new StandardTokenizer(Version.LUCENE_47, reader); TokenStream filter = new ShingleFilter(source, minGram, maxGram); // filter = new LowerCaseFilter(Version.LUCENE_47, filter); // filter = new StopFilter(Version.LUCENE_47, filter, // StopAnalyzer.ENGLISH_STOP_WORDS_SET); return new TokenStreamComponents(source, filter); }
From source file:com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java
License:Apache License
/**
 * Creates a token stream for the given English text, optionally wrapped in a
 * shingle filter producing n-grams of size [minNGram, maxNGram].
 *
 * @param text the raw text to tokenize
 * @return the (possibly shingled) token stream
 */
protected TokenStream createTokenStream(String text) {
    Set<?> effectiveStopWords;
    if (this.stopWords == null) {
        effectiveStopWords = EnglishAnalyzer.getDefaultStopSet();
    } else {
        effectiveStopWords = StopFilter.makeStopSet(LUCENE_VERSION, stopWords);
    }
    Analyzer analyzer =
            new EnglishSpecialAnalyzer(LUCENE_VERSION, effectiveStopWords, this.stemExclusionsSet);
    TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
    if (this.nGram) {
        stream = new ShingleFilter(stream, this.minNGram, this.maxNGram);
    }
    return stream;
}
From source file:com.NGramTokenBaseAnalyzer.java
/**
 * Wraps the given token stream in a ShingleFilter configured with the
 * analyzer-wide min/max shingle sizes.
 *
 * @param tok     the upstream token stream
 * @param unigram whether single tokens are emitted alongside shingles
 * @return the configured shingle filter
 */
public static ShingleFilter filter(TokenStream tok, boolean unigram) {
    final ShingleFilter shingles =
            new ShingleFilter(tok, NGramTokenBaseAnalyzer.min, NGramTokenBaseAnalyzer.max);
    shingles.setOutputUnigrams(unigram);
    return shingles;
}
From source file:eu.edisonproject.utility.text.processing.NGramGenerator.java
private String getNGrams() throws IOException { // List<String> words = new ArrayList<>(); Analyzer analyzer = new StandardAnalyzer(stopwords); TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(getDescription())); StopFilter stopFilter = new StopFilter(tokenStream, stopwords); StringBuilder words = new StringBuilder(); try (ShingleFilter sf = new ShingleFilter(stopFilter, 2, maxNGrams)) { sf.setOutputUnigrams(false);/*from w ww . jav a2s . c o m*/ CharTermAttribute charTermAttribute = sf.addAttribute(CharTermAttribute.class); sf.reset(); while (sf.incrementToken()) { String word = charTermAttribute.toString(); word = word.replaceAll("_", " "); word = word.replaceAll("\\s{2,}", " "); word = word.replaceAll(" ", "_"); words.append(word).append(" "); } sf.end(); } words.deleteCharAt(words.length() - 1); words.setLength(words.length()); return words.toString(); }
From source file:lucene.TestAnalyzer.java
protected TokenStreamComponents createComponents(String fieldName, Reader reader) { String token;/* w w w .j a va2s.c om*/ TokenStream result = null; // Tokenizer source = new WhitespaceTokenizer( Version.LUCENE_CURRENT, reader ); Tokenizer source = new WhitespaceTokenizer(); result = new ShingleFilter(source, 2, 2); return new TokenStreamComponents(source, result); }
From source file:nl.uva.p2psearch.Main.java
/**
 * Builds inverted-index entries for a metadata entry's description text.
 * First pass: stems each token (Porter) and emits one entry per token
 * occurrence while counting term frequencies. Second pass: re-tokenizes the
 * stemmed text and emits entries for word shingles (2..maxNGrams), with the
 * shingle words joined by underscores.
 *
 * NOTE(review): each emitted entry carries the tf value as of the moment the
 * occurrence was seen (not the final total), and duplicate occurrences
 * produce multiple entries — confirm downstream merging relies on this.
 *
 * @param e the metadata entry whose description is indexed
 * @return one entry per token/shingle occurrence
 * @throws IOException if tokenization fails
 */
private static List<InvertedIndexEntry> getInvertedIndexEntries(final MetadataEntry e)
        throws IOException {
    String text = e.getDescription();
    // term-hash -> running occurrence count
    Map<Number160, Integer> dictionary = new HashMap<>();
    List<InvertedIndexEntry> list = new ArrayList<>();
    Analyzer analyzer = new ArmenianAnalyzer(Version.LUCENE_42, Utils.getCharArrayStopwords());
    try (TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text))) {
        PorterStemFilter psf = new PorterStemFilter(tokenStream);
        CharTermAttribute term = psf.addAttribute(CharTermAttribute.class);
        psf.reset();
        // accumulates the stemmed text for the second (shingle) pass
        StringBuilder sb = new StringBuilder();
        while (psf.incrementToken()) {
            Integer tf;
            Number160 termKey = Number160.createHash(term.toString());
            if (dictionary.containsKey(termKey)) {
                tf = dictionary.get(termKey);
                tf++;
            } else {
                tf = 1;
            }
            dictionary.put(termKey, tf);
            sb.append(term.toString()).append(" ");
            // posting list: the hash of this entry's document id
            List<Number160> ll = new ArrayList<>();
            ll.add(Number160.createHash(e.getID()));
            list.add(new InvertedIndexEntry(termKey.toString(), term.toString(), tf, ll));
        }
        // second pass: shingle the stemmed text into n-grams
        StandardTokenizer source =
                new StandardTokenizer(Version.LUCENE_42, new StringReader(sb.toString()));
        TokenStream tokenStreamSF = new StandardFilter(Version.LUCENE_42, source);
        try (ShingleFilter sf = new ShingleFilter(tokenStreamSF, 2, maxNGrams)) {
            sf.setOutputUnigrams(false); // unigrams were already emitted above
            CharTermAttribute charTermAttribute = sf.addAttribute(CharTermAttribute.class);
            sf.reset();
            while (sf.incrementToken()) {
                String word = charTermAttribute.toString();
                // join the shingle's words with '_' before hashing
                String ng = word.replaceAll(" ", "_");
                Integer tf;
                Number160 termKey = Number160.createHash(ng);
                if (dictionary.containsKey(termKey)) {
                    tf = dictionary.get(termKey);
                    tf++;
                } else {
                    tf = 1;
                }
                dictionary.put(termKey, tf);
                List<Number160> ll = new ArrayList<>();
                ll.add(Number160.createHash(e.getID()));
                list.add(new InvertedIndexEntry(termKey.toString(), ng, tf, ll));
            }
        }
    }
    return list;
}
From source file:nl.uva.sne.commons.SemanticUtils.java
public static List<String> getNGrams(String text, int maxNGrams) throws IOException { List<String> words = new ArrayList<>(); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42, CharArraySet.EMPTY_SET); TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text)); StopFilter stopFilter = new StopFilter(Version.LUCENE_42, tokenStream, getStopWords()); stopFilter.setEnablePositionIncrements(false); // SnowballFilter snowballFilter = new SnowballFilter(stopFilter, "English"); try (ShingleFilter sf = new ShingleFilter(stopFilter, 2, maxNGrams)) { sf.setOutputUnigrams(false);//w w w . j a v a 2 s . co m CharTermAttribute charTermAttribute = sf.addAttribute(CharTermAttribute.class); sf.reset(); while (sf.incrementToken()) { String word = charTermAttribute.toString(); words.add(word.replaceAll(" ", "_")); } sf.end(); } return words; }
From source file:org.apache.james.mailbox.lucene.search.LenientImapSearchAnalyzer.java
License:Apache License
@Override public TokenStream tokenStream(String arg0, Reader reader) { return new ShingleFilter(new UpperCaseFilter(new WhitespaceTokenizer(Version.LUCENE_31, reader)), 2, maxTokenLength);//from ww w. ja v a2 s. co m }
From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java
License:Apache License
/**
 * Builds an n-gram token stream over the content: standard tokenization,
 * lower-casing, the configured stemmer, then word shingles of size
 * [mingram, maxgram] with unigrams suppressed.
 *
 * NOTE(review): this also mutates the tokenStream field at each stage —
 * presumably applyStemmer consumes that field; confirm before refactoring.
 *
 * @param content the raw text to tokenize
 * @param mingram minimum shingle size in words
 * @param maxgram maximum shingle size in words
 * @return the shingled token stream
 */
private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(content));
    tokenStream = new LowerCaseFilter(tokenizer);
    tokenStream = applyStemmer(stemFilterType);
    ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
    shingleFilter.setOutputUnigrams(false);
    // ShingleFilter already is a TokenStream; the original's cast was redundant
    tokenStream = shingleFilter;
    return tokenStream;
}