Usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
From source file:com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java
License:Open Source License
/**
 * Analyzes {@code text} and collects every produced term into {@code termSet}
 * by pulling the stream through a {@link TermSetTokenFilter}.
 *
 * @param text    the text to analyze; a {@code null} value is silently ignored
 * @param termSet the collection that receives each extracted term
 * @throws IOException if the token stream fails while advancing
 */
public void extractTerms(String text, Collection<String> termSet) throws IOException {
    if (text == null) {
        return;
    }
    TokenStream tokens = tokenStream(null, new StringReader(text));
    try {
        // The filter records every term into termSet as the stream is consumed.
        tokens = new TermSetTokenFilter(termSet, tokens);
        while (tokens.incrementToken()) {
            // Nothing to do per token: collection happens inside the filter.
        }
    } finally {
        IOUtils.closeQuietly(tokens);
    }
}
From source file:com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java
License:Open Source License
/**
 * Analyzes {@code text} and feeds every token into the named-entity
 * extraction result through a {@link NamedEntityPopulateFilter}.
 *
 * @param text   the text to analyze; a {@code null} value is silently ignored
 * @param result receives the named entities found by the filter
 * @throws IOException if the token stream fails while advancing
 */
public void populate(String text, ResultNamedEntityExtraction result) throws IOException {
    if (text == null) {
        return;
    }
    StringReader reader = new StringReader(text);
    TokenStream ts = tokenStream(null, reader);
    try {
        // Wrap inside the try (as extractTerms does) so the underlying stream
        // is still closed if the filter constructor throws.
        ts = new NamedEntityPopulateFilter(result, ts);
        while (ts.incrementToken()) {
            // Population happens inside the filter; nothing to do per token.
        }
    } finally {
        IOUtils.closeQuietly(ts);
    }
}
From source file:com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java
License:Open Source License
public void populate(String text, FieldContent fieldContent) throws IOException { if (text == null) return;// w w w .j a v a 2 s. c om StringReader reader = new StringReader(text); TokenStream ts = tokenStream(null, reader); ts = new FieldContentPopulateFilter(fieldContent, ts); try { while (ts.incrementToken()) ; } finally { IOUtils.closeQuietly(ts); } }
From source file:com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java
License:Open Source License
public void populate(String text, List<TokenTerm> tokenTerms) throws IOException { if (text == null) return;//from w w w. j a v a 2 s.c o m StringReader reader = new StringReader(text); TokenStream ts = tokenStream(null, reader); ts = new TokenTermPopulateFilter(tokenTerms, ts); try { while (ts.incrementToken()) ; } finally { IOUtils.closeQuietly(ts); } }
From source file:com.jamespot.glifpix.index.ResourceDocument.java
License:Open Source License
/**
 * Stores the literal on the Lucene document, then tokenizes a cleaned copy
 * and stores the token count + underscore-joined tokens ("N_t1_t2...") in the
 * "token" field along with its CRC in the "crc" field. Also records the token
 * count in {@code _resourceLength}.
 *
 * @param literal the raw literal to index
 * @throws IOException if tokenization fails
 */
private void addLiteralField(String literal) throws IOException {
    _luceneDocument.add(
            new Field("literal", replaceUnicodeStr(literal), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    // Strip escaped quotes before normalizing; this cleaned form is what gets tokenized.
    String coolLiteral = literal.replaceAll("\\\"", "");
    coolLiteral = replaceUnicodeStr(coolLiteral);
    Analyzer resAnalyzer = new ContentAnalyzer();
    TokenStream ts = resAnalyzer.tokenStream("dummyField", new StringReader(coolLiteral));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    int length = 0;
    // Local buffer: StringBuilder (no synchronization needed, unlike StringBuffer).
    StringBuilder sb = new StringBuilder();
    try {
        while (ts.incrementToken()) {
            sb.append("_" + termAttribute.term());
            length++;
        }
        ts.end();
    } finally {
        // Close in finally so the stream is released even if tokenization throws
        // (the original leaked it on exception).
        ts.close();
    }
    sb.insert(0, length);
    _resourceLength = length;
    String finalToken = sb.toString();
    _luceneDocument.add(new Field("token", finalToken, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    _luceneDocument.add(new Field("crc", Utils.getCRC(finalToken), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
/**
 * Tokenizes {@code content} and returns the frequency of each known tag for
 * language {@code lng}. Candidate expressions (up to {@code _MaxExpressionLength}
 * tokens) are matched against the language's CRC store; tags shorter than
 * {@code _MinWordLength} are dropped. IO failures are logged and the partial
 * result is returned (best-effort).
 *
 * @param content the text to scan
 * @param lng     language key into {@code _resStores} / {@code _lngStopTags}
 * @return tag -> occurrence count (possibly empty, never {@code null})
 */
public Map<String, Integer> getTagsFreq(String content, String lng) {
    Map<String, Integer> items = new HashMap<String, Integer>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);
    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                String tag = _resStores.get(lng).getTag(s.getKey());
                if (tag != null && tag.length() >= _MinWordLength) {
                    Integer previous = items.get(tag);
                    items.put(tag, previous == null ? s.getValue() : previous + s.getValue());
                }
            }
        }
        ts.end();
    } catch (IOException e) {
        // Best-effort: log and fall through to return what was collected so far.
        logger.error(e);
    } finally {
        // Close in finally so the stream is released even when incrementToken
        // throws (the original leaked it on exception).
        try {
            ts.close();
        } catch (IOException e) {
            logger.error(e);
        }
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
/**
 * Tokenizes {@code content} and returns a weighted frequency per known tag for
 * language {@code lng}: each match contributes {@code count * getTagWeight(crc, lng)}.
 * Tags shorter than {@code _MinWordLength} are dropped. IO failures are logged
 * and the partial result is returned (best-effort).
 *
 * @param content the text to scan
 * @param lng     language key into {@code _resStores} / {@code _lngStopTags}
 * @return tag -> weighted frequency (possibly empty, never {@code null})
 */
public Map<String, Float> getWeightedTagsFreq(String content, String lng) {
    Map<String, Float> items = new HashMap<String, Float>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);
    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                String tag = _resStores.get(lng).getTag(s.getKey());
                if (tag != null && tag.length() >= _MinWordLength) {
                    float contribution = s.getValue().floatValue() * getTagWeight(s.getKey(), lng);
                    Float previous = items.get(tag);
                    items.put(tag, previous == null ? contribution : previous + contribution);
                }
            }
        }
        ts.end();
    } catch (IOException e) {
        // Best-effort: log and fall through to return what was collected so far.
        logger.error(e);
    } finally {
        // Close in finally so the stream is released even when incrementToken
        // throws (the original leaked it on exception).
        try {
            ts.close();
        } catch (IOException e) {
            logger.error(e);
        }
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Set<String> getTokens(String content, String lng) { Set<String> tokens = new HashSet<String>(); TokensArray tokArray = new TokensArray(15); TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content)); TermAttribute termAttribute = ts.addAttribute(TermAttribute.class); try {//from w w w . j a v a2s .com while (ts.incrementToken()) { tokArray.pushString(termAttribute.term()); Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(), _lngStopTags.get(lng)); if (tagCandidates.size() > 0) { for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) { tokens.add(s.getKey()); } } } ts.end(); ts.close(); } catch (IOException e) { logger.error(e); } return tokens; }
From source file:com.leavesfly.lia.advsearching.SpanQueryTest.java
License:Apache License
private void dumpSpans(SpanQuery query) throws IOException { Spans spans = query.getSpans(reader); System.out.println(query + ":"); int numSpans = 0; TopDocs hits = searcher.search(query, 10); float[] scores = new float[2]; for (ScoreDoc sd : hits.scoreDocs) { scores[sd.doc] = sd.score;//from w ww .ja va2 s .c om } while (spans.next()) { // A numSpans++; int id = spans.doc(); Document doc = reader.document(id); // B TokenStream stream = analyzer.tokenStream("contents", // C new StringReader(doc.get("f"))); // C TermAttribute term = stream.addAttribute(TermAttribute.class); StringBuilder buffer = new StringBuilder(); buffer.append(" "); int i = 0; while (stream.incrementToken()) { // D if (i == spans.start()) { // E buffer.append("<"); // E } // E buffer.append(term.term()); // E if (i + 1 == spans.end()) { // E buffer.append(">"); // E } // E buffer.append(" "); i++; } buffer.append("(").append(scores[id]).append(") "); System.out.println(buffer); } if (numSpans == 0) { System.out.println(" No spans"); } System.out.println(); }
From source file:com.leavesfly.lia.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokens(TokenStream stream) throws IOException { TermAttribute term = stream.addAttribute(TermAttribute.class); while (stream.incrementToken()) { System.out.print("[" + term.term() + "] "); // B }// w ww .j a va 2 s . c o m }