List of usage examples for org.apache.lucene.analysis.en EnglishAnalyzer tokenStream
public final TokenStream tokenStream(final String fieldName, final Reader reader)
Returns a TokenStream suitable for fieldName, tokenizing the contents of reader. From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanBigramQueryGeneratorTest.java
License:Open Source License
private void test_that_generated_bigram_query_match_the_referenced_query() throws IOException { // Get stemmed question SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER); EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer(); TokenStream ts = ea.tokenStream("field", question); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset();// w ww.jav a 2 s . co m List<String> stemmedQuestion = new ArrayList<String>(); while (ts.incrementToken()) stemmedQuestion.add(charTermAttribute.toString()); // get query terms BooleanClause[] clauses = ((BooleanQuery) spanBigramQuery).getClauses(); SpanQuery[] queries; String term1, term2; List<String> unigrams = new ArrayList<String>(); int numFields = clauses.length / (2 * stemmedQuestion.size() - 1); // test bigrams int bigramidx = 0; for (int idx = 0; idx < clauses.length; idx++) { Query q = clauses[idx].getQuery(); if (q instanceof SpanNearQuery) { queries = ((SpanNearQuery) clauses[idx].getQuery()).getClauses(); int termidx = bigramidx / numFields; term1 = ((SpanTermQuery) queries[0]).getTerm().text(); term2 = ((SpanTermQuery) queries[1]).getTerm().text(); assertEquals("Extracted first term doesn't match the stemmed term", stemmedQuestion.get(termidx), term1); assertEquals("Extracted second term doesn't match the stemmed term", stemmedQuestion.get(termidx + 1), term2); bigramidx++; } else if (q instanceof TermQuery) { unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text()); } else { assertTrue("Unknown type of query found!", false); } } // test unigrams for (String s : unigrams) assertTrue(stemmedQuestion.contains(s)); for (String s : stemmedQuestion) assertTrue(unigrams.contains(s)); }
From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanTrigramQueryGeneratorTest.java
License:Open Source License
private void test_that_generated_trigram_query_match_the_referenced_query() throws IOException { Set<Term> queryTerms = new HashSet<Term>(); // Get stemmed question SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER); EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer(); TokenStream ts = ea.tokenStream("field", question); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset();/* w w w .j a va 2 s . com*/ List<String> stemmedQuestion = new ArrayList<String>(); while (ts.incrementToken()) stemmedQuestion.add(charTermAttribute.toString()); // get query terms spanTrigramQuery.extractTerms(queryTerms); BooleanClause[] clauses = ((BooleanQuery) spanTrigramQuery).getClauses(); SpanQuery[] qs; String term1, term2, term3; int numFields = clauses.length / (3 * stemmedQuestion.size() - 3); List<String> unigrams = new ArrayList<String>(); int idx = 0; // test trigrams int trigramidx = 0; for (idx = clauses.length - numFields * (stemmedQuestion.size() - 2); idx < clauses.length; idx++) { qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses(); int termidx = trigramidx / numFields; term1 = ((SpanTermQuery) qs[0]).getTerm().text(); term2 = ((SpanTermQuery) qs[1]).getTerm().text(); term3 = ((SpanTermQuery) qs[2]).getTerm().text(); assertEquals("Extracted first term in the trigram doesn't match the stemmed term", stemmedQuestion.get(termidx), term1); assertEquals("Extracted second term in the trigram doesn't match the stemmed term", stemmedQuestion.get(termidx + 1), term2); assertEquals("Extracted third term in the trigram doesn't match the stemmed term", stemmedQuestion.get(termidx + 2), term3); trigramidx++; } // test bigrams int bigramidx = 0; for (idx = 0; idx < (2 * stemmedQuestion.size() - 1) * numFields; idx++) { Query q = clauses[idx].getQuery(); if (q instanceof SpanNearQuery) { qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses(); int termidx = bigramidx / numFields; term1 = 
((SpanTermQuery) qs[0]).getTerm().text(); term2 = ((SpanTermQuery) qs[1]).getTerm().text(); assertEquals("Extracted first term in the bigram doesn't match the stemmed term", stemmedQuestion.get(termidx), term1); assertEquals("Extracted second term in the bigram doesn't match the stemmed term", stemmedQuestion.get(termidx + 1), term2); bigramidx++; } else if (q instanceof TermQuery) { unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text()); } else { assertTrue("Unknown type of query found!", false); } } // test unigrams for (String s : unigrams) assertTrue(stemmedQuestion.contains(s)); for (String s : stemmedQuestion) assertTrue(unigrams.contains(s)); }
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.TFIDF.java
License:Open Source License
protected void computeTFIDF(List<TFIDFTerm> wordList, int totalWordsDoc) { if (reader != null && searcher != null) { double tf; double idf; double tfidf; EnglishAnalyzer analyzer = new EnglishAnalyzer(Version.LUCENE_40); TokenStream stream = null;/*from w w w.ja v a 2s. c o m*/ CharTermAttribute termAtt; String term; double totalWikiDocs = (double) reader.numDocs(); for (TFIDFTerm word : wordList) { try { term = ""; stream = analyzer.tokenStream("field", new StringReader(word.word)); termAtt = stream.addAttribute(CharTermAttribute.class); stream.reset(); // print all tokens until stream is exhausted while (stream.incrementToken()) { term += (termAtt.toString()); } // System.out.println(term); stream.end(); tf = (double) word.count / (double) totalWordsDoc; double wikiTermFrec = reader.docFreq(new Term("contents", term)); if (wikiTermFrec != 0) { idf = Math.log(totalWikiDocs / wikiTermFrec); tfidf = tf * idf; } else { tfidf = 0; } word.tfidf = tfidf; } catch (IOException ex) { logger.error("Error processing the TFIDF", ex); } finally { try { if (stream != null) { stream.close(); } } catch (IOException ex) { logger.error("Error processing the TFIDF", ex); } } } try { reader.close(); } catch (IOException ex) { logger.warn("Error closing lucene reader", ex); } } }