List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanTrigramQueryGeneratorTest.java
License:Open Source License
/**
 * Verifies that the generated span-trigram query matches the analyzed (stemmed)
 * question. The BooleanQuery's clause array is laid out per field as:
 * unigrams/bigrams interleaved first, then all trigram SpanNearQueries at the
 * tail; the arithmetic below relies on that fixed layout.
 */
private void test_that_generated_trigram_query_match_the_referenced_query() throws IOException {
    Set<Term> queryTerms = new HashSet<Term>();

    // Re-analyze the question with the same English analyzer the query
    // generator uses, collecting the stemmed tokens in order.
    SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER);
    EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer();
    TokenStream ts = ea.tokenStream("field", question);
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    List<String> stemmedQuestion = new ArrayList<String>();
    while (ts.incrementToken())
        stemmedQuestion.add(charTermAttribute.toString());

    // Pull the terms and clauses out of the generated query.
    spanTrigramQuery.extractTerms(queryTerms);
    BooleanClause[] clauses = ((BooleanQuery) spanTrigramQuery).getClauses();
    SpanQuery[] qs;
    String term1, term2, term3;
    // For n tokens there are n unigrams + (n-1) bigrams + (n-2) trigrams
    // = 3n-3 clauses per field, so this recovers the number of fields.
    int numFields = clauses.length / (3 * stemmedQuestion.size() - 3);
    List<String> unigrams = new ArrayList<String>();
    int idx = 0;

    // --- Trigrams: the last numFields*(n-2) clauses are SpanNearQueries of
    // three terms each; termidx maps a clause back to its token position.
    int trigramidx = 0;
    for (idx = clauses.length - numFields * (stemmedQuestion.size() - 2); idx < clauses.length; idx++) {
        qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
        int termidx = trigramidx / numFields;
        term1 = ((SpanTermQuery) qs[0]).getTerm().text();
        term2 = ((SpanTermQuery) qs[1]).getTerm().text();
        term3 = ((SpanTermQuery) qs[2]).getTerm().text();
        assertEquals("Extracted first term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx), term1);
        assertEquals("Extracted second term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx + 1), term2);
        assertEquals("Extracted third term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx + 2), term3);
        trigramidx++;
    }

    // --- Bigrams and unigrams: the first (2n-1)*numFields clauses mix
    // two-term SpanNearQueries (bigrams) with plain TermQueries (unigrams).
    int bigramidx = 0;
    for (idx = 0; idx < (2 * stemmedQuestion.size() - 1) * numFields; idx++) {
        Query q = clauses[idx].getQuery();
        if (q instanceof SpanNearQuery) {
            qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
            int termidx = bigramidx / numFields;
            term1 = ((SpanTermQuery) qs[0]).getTerm().text();
            term2 = ((SpanTermQuery) qs[1]).getTerm().text();
            assertEquals("Extracted first term in the bigram doesn't match the stemmed term",
                    stemmedQuestion.get(termidx), term1);
            assertEquals("Extracted second term in the bigram doesn't match the stemmed term",
                    stemmedQuestion.get(termidx + 1), term2);
            bigramidx++;
        } else if (q instanceof TermQuery) {
            // Collect unigrams for the set comparison below.
            unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text());
        } else {
            assertTrue("Unknown type of query found!", false);
        }
    }

    // --- Unigrams: set equality (both inclusions) with the stemmed tokens.
    for (String s : unigrams)
        assertTrue(stemmedQuestion.contains(s));
    for (String s : stemmedQuestion)
        assertTrue(unigrams.contains(s));
}
From source file:com.ikon.analysis.AnalyzerDemo.java
License:Open Source License
/** * Analyze and display tokens/*from w w w . j a va 2 s .c om*/ */ private static void analyze(String string, Analyzer analyzer) throws IOException { StringBuffer buffer = new StringBuffer(); TokenStream stream = analyzer.tokenStream("contents", new StringReader(string)); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); buffer.append(analyzer.getClass().getName()); buffer.append(" -> "); while (stream.incrementToken()) { buffer.append(" ["); buffer.append(term.toString()); buffer.append("]"); } String output = buffer.toString(); log.info(output); }
From source file:com.jamespot.glifpix.index.ResourceDocument.java
License:Open Source License
private void addLiteralField(String literal) throws IOException { _luceneDocument// w w w. j av a 2 s.c om .add(new Field("literal", replaceUnicodeStr(literal), Store.YES, Index.NOT_ANALYZED_NO_NORMS)); String coolLiteral = literal.replaceAll("\\\"", ""); coolLiteral = replaceUnicodeStr(coolLiteral); Analyzer resAnalyzer = new ContentAnalyzer(); TokenStream ts = resAnalyzer.tokenStream("dummyField", new StringReader(coolLiteral)); TermAttribute termAttribute = ts.addAttribute(TermAttribute.class); int length = 0; StringBuffer sb = new StringBuffer(); while (ts.incrementToken()) { sb.append("_" + termAttribute.term()); length++; } sb.insert(0, length); _resourceLength = length; ts.end(); ts.close(); String finalToken = sb.toString(); _luceneDocument.add(new Field("token", finalToken, Store.YES, Index.NOT_ANALYZED_NO_NORMS)); _luceneDocument.add(new Field("crc", Utils.getCRC(finalToken), Store.YES, Index.NOT_ANALYZED_NO_NORMS)); }
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
/**
 * Extracts tags from the given content and counts how often each occurs.
 * Tokens are pushed through a sliding {@code TokensArray} window; candidate
 * CRCs are resolved to tags via the per-language resource store, and tags
 * shorter than {@code _MinWordLength} are dropped.
 *
 * @param content text to scan for tags
 * @param lng     language key into {@code _resStores} / {@code _lngStopTags}
 * @return tag -> occurrence count (empty map if tokenization fails)
 */
public Map<String, Integer> getTagsFreq(String content, String lng) {
    Map<String, Integer> items = new HashMap<String, Integer>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);
    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        // TokenStream contract: reset() before the first incrementToken()
        // (the original skipped it).
        ts.reset();
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                String tag = _resStores.get(lng).getTag(s.getKey());
                if (tag != null && tag.length() >= _MinWordLength) {
                    Integer prev = items.get(tag);
                    items.put(tag, prev == null ? s.getValue() : prev + s.getValue());
                }
            }
        }
        ts.end();
    } catch (IOException e) {
        logger.error(e);
    } finally {
        // Close in finally so the stream is released even when
        // incrementToken() throws (the original leaked it in that case).
        try {
            ts.close();
        } catch (IOException e) {
            logger.error(e);
        }
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Map<String, Float> getWeightedTagsFreq(String content, String lng) { Map<String, Float> items = new HashMap<String, Float>(); TokensArray tokArray = new TokensArray(_MaxExpressionLength); TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content)); TermAttribute termAttribute = ts.addAttribute(TermAttribute.class); try {//from w w w . j av a2 s . co m while (ts.incrementToken()) { tokArray.pushString(termAttribute.term()); Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(), _lngStopTags.get(lng)); if (tagCandidates.size() > 0) { for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) { String tag = _resStores.get(lng).getTag(s.getKey()); if (tag != null && tag.length() >= _MinWordLength) { if (items.containsKey(tag)) { items.put(tag, items.get(tag) + (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng)); } else { items.put(tag, (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng)); } } } } } ts.end(); ts.close(); } catch (IOException e) { logger.error(e); } return items; }
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
/**
 * Returns the set of candidate tag CRC keys found in the given content.
 * Unlike the frequency methods, candidates are collected as-is without
 * resolving them to tag text or applying the minimum-length filter.
 *
 * @param content text to scan
 * @param lng     language key into {@code _resStores} / {@code _lngStopTags}
 * @return distinct candidate keys (empty set if tokenization fails)
 */
public Set<String> getTokens(String content, String lng) {
    Set<String> tokens = new HashSet<String>();
    // NOTE(review): window size is hard-coded to 15 here while the sibling
    // frequency methods use _MaxExpressionLength — confirm this divergence
    // is intentional.
    TokensArray tokArray = new TokensArray(15);
    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        // TokenStream contract: reset() before the first incrementToken()
        // (the original skipped it).
        ts.reset();
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                tokens.add(s.getKey());
            }
        }
        ts.end();
    } catch (IOException e) {
        logger.error(e);
    } finally {
        // Close in finally so the stream is released even when
        // incrementToken() throws (the original leaked it in that case).
        try {
            ts.close();
        } catch (IOException e) {
            logger.error(e);
        }
    }
    return tokens;
}
From source file:com.leavesfly.lia.advsearching.SpanQueryTest.java
License:Apache License
private void dumpSpans(SpanQuery query) throws IOException { Spans spans = query.getSpans(reader); System.out.println(query + ":"); int numSpans = 0; TopDocs hits = searcher.search(query, 10); float[] scores = new float[2]; for (ScoreDoc sd : hits.scoreDocs) { scores[sd.doc] = sd.score;// w w w .j a v a 2 s . c om } while (spans.next()) { // A numSpans++; int id = spans.doc(); Document doc = reader.document(id); // B TokenStream stream = analyzer.tokenStream("contents", // C new StringReader(doc.get("f"))); // C TermAttribute term = stream.addAttribute(TermAttribute.class); StringBuilder buffer = new StringBuilder(); buffer.append(" "); int i = 0; while (stream.incrementToken()) { // D if (i == spans.start()) { // E buffer.append("<"); // E } // E buffer.append(term.term()); // E if (i + 1 == spans.end()) { // E buffer.append(">"); // E } // E buffer.append(" "); i++; } buffer.append("(").append(scores[id]).append(") "); System.out.println(buffer); } if (numSpans == 0) { System.out.println(" No spans"); } System.out.println(); }
From source file:com.leavesfly.lia.analysis.AnalyzerUtils.java
License:Apache License
/**
 * Prints each token of the given stream to stdout as {@code [token] }.
 *
 * NOTE(review): this method never calls stream.reset()/end()/close(); it
 * assumes the caller manages the stream lifecycle. Confirm callers reset the
 * stream before passing it in — reset() is mandatory before the first
 * incrementToken() under the Lucene TokenStream contract.
 *
 * @param stream the token stream to consume (ownership stays with the caller)
 * @throws IOException if the stream cannot be read
 */
public static void displayTokens(TokenStream stream) throws IOException {
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
        System.out.print("[" + term.term() + "] "); // one bracketed token per line segment
    }
}
From source file:com.leavesfly.lia.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", // #A new StringReader(text)); TermAttribute term = stream.addAttribute(TermAttribute.class); // #B PositionIncrementAttribute posIncr = // #B stream.addAttribute(PositionIncrementAttribute.class); // #B OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); // #B TypeAttribute type = stream.addAttribute(TypeAttribute.class); // #B int position = 0; while (stream.incrementToken()) { // #C int increment = posIncr.getPositionIncrement(); // #D if (increment > 0) { // #D position = position + increment; // #D System.out.println(); // #D System.out.print(position + ": "); // #D }//from w ww . ja v a 2 s . c om System.out.print("[" + // #E term.term() + ":" + // #E offset.startOffset() + "->" + // #E offset.endOffset() + ":" + // #E type.type() + "] "); // #E } System.out.println(); }
From source file:com.leavesfly.lia.analysis.Fragments.java
License:Apache License
/**
 * Book-style fragment illustrating how to read position increments from a
 * TokenStream. Not runnable as-is: {@code analyzer} and {@code text} are
 * deliberate null placeholders (calling this method throws
 * NullPointerException); only the code between START and END is meant to be
 * copied into real code.
 */
public void frag3() throws Exception {
    Analyzer analyzer = null; // placeholder — substitute a real Analyzer
    String text = null; // placeholder — substitute the text to analyze
    // START
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    PositionIncrementAttribute posIncr = (PositionIncrementAttribute) stream
            .addAttribute(PositionIncrementAttribute.class);
    while (stream.incrementToken()) {
        // Print how far each token advances the position counter.
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
    }
    // END
}