List of usage examples for the StopAnalyzer constructor of org.apache.lucene.analysis.core.StopAnalyzer
public StopAnalyzer(Reader stopwords) throws IOException
From source file:br.edu.utfpr.cm.JGitMinerWeb.services.matrix.auxiliary.LuceneUtil.java
public static List<String> tokenizeString(String linha) { Analyzer analyzer = new StopAnalyzer(Version.LUCENE_46); List<String> result = new ArrayList<>(); try {//w ww .j a v a 2 s .c o m TokenStream stream = analyzer.tokenStream(null, new StringReader(linha)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (IOException e) { System.out.println(e.getMessage()); } return result; }
From source file:com.tuplejump.stargate.lucene.AnalyzerFactory.java
License:Apache License
public static Analyzer getAnalyzer(String analyzerName, Version luceneV) { try {//from w w w . j a v a 2s . c o m Analyzers analyzer = Analyzers.valueOf(analyzerName); switch (analyzer) { case SimpleAnalyzer: { return new SimpleAnalyzer(luceneV); } case StandardAnalyzer: { return new StandardAnalyzer(luceneV); } case StopAnalyzer: { return new StopAnalyzer(luceneV); } case WhitespaceAnalyzer: { return new WhitespaceAnalyzer(luceneV); } case KeywordAnalyzer: { return new CaseInsensitiveKeywordAnalyzer(luceneV); } default: { return new StandardAnalyzer(luceneV); } } } catch (IllegalArgumentException e) { } return null; }
From source file:eu.eexcess.federatedrecommender.decomposer.PseudoRelevanceSourcesDecomposer.java
License:Open Source License
@Override public SecureUserProfile decompose(SecureUserProfileEvaluation inputSecureUserProfile) { FederatedRecommenderCore fCore = null; try {//from w w w . jav a 2s . c o m fCore = FederatedRecommenderCore.getInstance(null); } catch (FederatedRecommenderException e) { logger.log(Level.SEVERE, "Error getting FederatedRecommenderCore,was perhabs not initialized correctly", e); } Set<String> keywords = new HashSet<String>(); for (ContextKeyword cKeyword : inputSecureUserProfile.contextKeywords) { keywords.add(cKeyword.text); } // tmpSUP.partnerList = inputSecureUserProfile.queryExpansionSourcePartner; List<PartnerBadge> tmpPartnerList = new ArrayList<PartnerBadge>(); for (PartnerBadge partnerBadge : inputSecureUserProfile.partnerList) { tmpPartnerList.add(partnerBadge); } inputSecureUserProfile.partnerList = inputSecureUserProfile.queryExpansionSourcePartner; PartnersFederatedRecommendations pFR = fCore.getPartnersRecommendations(inputSecureUserProfile); inputSecureUserProfile.partnerList = tmpPartnerList; Directory directory = new RAMDirectory(); Analyzer analyzer = new StopAnalyzer(Version.LUCENE_48); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer); IndexWriter writer = null; try { writer = new IndexWriter(directory, config); for (ResultList resultLists : pFR.getResults().values()) { for (Result result : resultLists.results) { addDoc(writer, result.description); addDoc(writer, result.title); } } writer.close(); IndexReader reader = DirectoryReader.open(directory); TermStats[] tStats = null; try { tStats = HighFreqTerms.getHighFreqTerms(reader, 20, "content", new DocFreqComparator()); } catch (Exception e) { logger.log(Level.SEVERE, "Could not open HighFreqTerms", e); } finally { reader.close(); } if (tStats != null) { for (TermStats termStats : tStats) { String utf8String = termStats.termtext.utf8ToString(); if (utf8String.length() > 4) if (!checkHighFreqTermsQuery(utf8String.toLowerCase(), keywords)) if 
(keywords.add(utf8String.toLowerCase())) { inputSecureUserProfile.contextKeywords.add(new ContextKeyword(utf8String, termStats.docFreq / 100.0, ExpansionType.EXPANSION)); } } } else logger.log(Level.SEVERE, "TermStats was null!"); } catch (IOException e) { logger.log(Level.SEVERE, "There was and error writing/reading the Index", e); } logger.log(Level.INFO, "Source Expansion: " + keywords.toString() + " Partners: " + inputSecureUserProfile.queryExpansionSourcePartner); return inputSecureUserProfile; }
From source file:lia.analysis.CopyOfAnalyzerDemo.java
License:Apache License
public static void main(String[] args) throws IOException { analyzers = new Analyzer[] { new WhitespaceAnalyzer(Version.LUCENE_46), new SimpleAnalyzer(Version.LUCENE_46), new StopAnalyzer(Version.LUCENE_46), new StandardAnalyzer(Version.LUCENE_46), new KeywordAnalyzer(), new KoreanAnalyzer(Version.LUCENE_46) }; String[] strings = examples;//from www. j a v a2s . c o m for (String text : strings) { analyze(text); } }
From source file:nl.cwi.helpers.NGramExtractor.java
License:Open Source License
/** * Extracts NGrams from a String of text. * Can handle ngrams of any length and also perform stop word removal before extraction * @param text the text that the ngrams should be extracted from * @param minLength the minimum length of the ngrams * @param maxLength the maximum length of the ngrams * @param stopWords whether or not stopwords should be removed before extraction *//*www. j ava 2 s.co m*/ public void extract(String text, int minLength, int maxLength, Boolean stopWords) throws FileNotFoundException, IOException { this.text = text; this.minLength = minLength; this.maxLength = maxLength; this.stopWords = stopWords; nGrams = new LinkedList<String>(); uniqueNGrams = new LinkedList<String>(); nGramFreqs = new HashMap<String, Integer>(); /* If the minLength and maxLength are both 1, then we want unigrams * Make use of a StopAnalyzer when stopwords should be removed * Make use of a SimpleAnalyzer when stop words should be included */ if ((minLength == 1) && (maxLength == 1)) { if (this.stopWords) { analyzer = new StopAnalyzer(Version.LUCENE_43); } else { analyzer = new SimpleAnalyzer(Version.LUCENE_43); } } else { //Bigger than unigrams so use ShingleAnalyzerWrapper. Once again, different analyzers depending on stop word removal if (this.stopWords) { analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_42), minLength, maxLength, " ", false, false); //This is a hack to use Lucene 2.4 since in 2.4 position increments weren't preserved by default. Using a later version puts underscores (_) in the place of removed stop words. 
} else { analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(Version.LUCENE_42), minLength, maxLength, " ", false, false); } } //Code to process and extract the ngrams TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text)); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); int tokenCount = 0; tokenStream.reset(); //System.out.println("So this is:" + charTermAttribute.toString() ); while (tokenStream.incrementToken()) { //System.out.println("Lets see"); int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String termToken = charTermAttribute.toString(); //The actual token term nGrams.add(termToken); //Add all ngrams to the ngram LinkedList } //Store unique nGrams and frequencies in hash tables for (String nGram : nGrams) { if (nGramFreqs.containsKey(nGram)) { nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1); } else { nGramFreqs.put(nGram, 1); uniqueNGrams.add(nGram); } } }