List of usage examples for org.apache.lucene.analysis TokenStream close
@Override public void close() throws IOException
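Since TokenStream implements Closeable, close() pairs naturally with try-with-resources. Below is a minimal sketch of the standard consume cycle on the Lucene 4.x+ API; the analyzer and text variables are assumed for illustration, and the fragment is presumed to sit in a method that declares throws IOException.

try (TokenStream ts = analyzer.tokenStream("field", new StringReader(text))) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                      // mandatory before the first incrementToken()
    while (ts.incrementToken()) {
        System.out.println(term.toString());
    }
    ts.end();                        // records end-of-stream state (e.g. final offset)
}                                    // close() is invoked automatically here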
From source file:com.jamespot.glifpix.index.ResourceDocument.java
License:Open Source License
private void addLiteralField(String literal) throws IOException {
    _luceneDocument.add(new Field("literal", replaceUnicodeStr(literal), Store.YES,
            Index.NOT_ANALYZED_NO_NORMS));

    String coolLiteral = literal.replaceAll("\\\"", "");
    coolLiteral = replaceUnicodeStr(coolLiteral);

    Analyzer resAnalyzer = new ContentAnalyzer();
    TokenStream ts = resAnalyzer.tokenStream("dummyField", new StringReader(coolLiteral));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    // Build a token signature of the form "<count>_tok1_tok2..." from the analyzed literal
    int length = 0;
    StringBuffer sb = new StringBuffer();
    while (ts.incrementToken()) {
        sb.append("_" + termAttribute.term());
        length++;
    }
    sb.insert(0, length);
    _resourceLength = length;

    ts.end();
    ts.close();

    String finalToken = sb.toString();
    _luceneDocument.add(new Field("token", finalToken, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    _luceneDocument.add(new Field("crc", Utils.getCRC(finalToken), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
}
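This example predates Lucene 4: TermAttribute was later removed in favor of CharTermAttribute, and newer versions also require reset() before the first incrementToken(). A hedged sketch of the same loop on the 4.x+ API (StringBuilder swapped in for StringBuffer, since no synchronization is needed):

    TokenStream ts = resAnalyzer.tokenStream("dummyField", new StringReader(coolLiteral));
    CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int length = 0;
    StringBuilder sb = new StringBuilder();
    while (ts.incrementToken()) {
        sb.append('_').append(termAttribute.toString());
        length++;
    }
    sb.insert(0, length);
    ts.end();
    ts.close();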
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Map<String, Integer> getTagsFreq(String content, String lng) {
    Map<String, Integer> items = new HashMap<String, Integer>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    String tag = _resStores.get(lng).getTag(s.getKey());
                    if (tag != null && tag.length() >= _MinWordLength) {
                        if (items.containsKey(tag)) {
                            items.put(tag, items.get(tag) + s.getValue());
                        } else {
                            items.put(tag, s.getValue());
                        }
                    }
                }
            }
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error(e);
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Map<String, Float> getWeightedTagsFreq(String content, String lng) {
    Map<String, Float> items = new HashMap<String, Float>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    String tag = _resStores.get(lng).getTag(s.getKey());
                    if (tag != null && tag.length() >= _MinWordLength) {
                        if (items.containsKey(tag)) {
                            items.put(tag,
                                    items.get(tag) + (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng));
                        } else {
                            items.put(tag, (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng));
                        }
                    }
                }
            }
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error(e);
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Set<String> getTokens(String content, String lng) {
    Set<String> tokens = new HashSet<String>();
    TokensArray tokArray = new TokensArray(15);

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    tokens.add(s.getKey());
                }
            }
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error(e);
    }
    return tokens;
}
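All three TagsExtractorImpl methods above call ts.end() and ts.close() inside the try block, so an IOException thrown by incrementToken() skips close() entirely and leaks the stream. A sketch of the same skeleton with the close moved into a finally block (names reused from the examples above; the loop body is elided):

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            // ... push tokens and collect tag candidates as above ...
        }
        ts.end();
    } catch (IOException e) {
        logger.error(e);
    } finally {
        try {
            ts.close();   // always release the stream, even after a failure
        } catch (IOException e) {
            logger.error(e);
        }
    }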
From source file:com.lorelib.analyzer.sample.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Build an IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    // Obtain a Lucene TokenStream for the sample text
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "这是一个中文分词的例子，你可以直接运行它！IKAnalyer can analysis english text too"));
        // Offset attribute (start/end position of each token)
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // Reset the TokenStream (and the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // Perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the TokenStream, releasing the underlying StringReader
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:com.mathworks.xzheng.analysis.AnalyzerUtils.java
License:Apache License
public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.toString());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.close();
}
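Note that this helper never calls stream.reset() or stream.end(); on Lucene 4.x+, incrementToken() without a prior reset() fails. A hedged corrected version under that assumption:

    public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
        TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                           // required before consuming
        for (String expected : output) {
            Assert.assertTrue(stream.incrementToken());
            Assert.assertEquals(expected, termAttr.toString());
        }
        Assert.assertFalse(stream.incrementToken());
        stream.end();
        stream.close();
    }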
From source file:com.mhs.qsol.proximity.ProximityVisitor.java
License:Apache License
/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate query.
 *
 * @param token
 * @return
 */
protected Query tokenToQuery(String token) {
    if (logger.isLoggable(Level.FINE)) {
        logger.fine("Query tokenToQuery(String token) : token:" + token);
    }

    token = removeEscapeChars(token);

    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);

    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            t = null;
        }
        if (t == null) {
            break;
        }
        v.add(t);
        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            severalTokensAtSamePosition = true;
        }
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }

    if (v.size() == 0) {
        return null;
    } else if (v.size() == 1) {
        t = v.get(0);
        SpanTermQuery stq = new SpanTermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        stq.setBoost(this.boost);
        return stq;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // No phrase query: OR together one SpanTermQuery per token.
                // (The original built every Term from an always-empty StringBuilder
                // named "regex"; fixed here to use each token's text instead.)
                SpanQuery[] spanQueries = new SpanQuery[v.size()];
                for (int i = 0; i < v.size(); i++) {
                    Token tok = v.get(i);
                    spanQueries[i] = new SpanTermQuery(
                            new Term(field, new String(tok.buffer(), 0, tok.length())));
                }
                return new SpanOrQuery(spanQueries);
            } else {
                // All the Tokens in each sub-list are positioned at the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }

                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);
                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }
                    int size = curTermQueries.size();
                    if (size <= 0)
                        continue;
                    else if (size == 1)
                        spanNearSubclauses.add(curTermQueries.get(0));
                    else
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                }

                SpanNearQuery query = new SpanNearQuery(
                        (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true);
                return query;
            }
        } else {
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];
            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                clauses[i] = new SpanTermQuery(new Term(field, new String(t2.buffer(), 0, t2.length())));
            }
            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);
            return query;
        }
    }
}
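The collect-then-build structure above wraps source.close() in its own try/catch because close() declares IOException even on a StringReader-backed stream. On the Lucene 4.x+ API the collect phase can be expressed with try-with-resources instead; here is a sketch reusing the names from the example (note the Token constructor shown was deprecated and later removed, so this assumes a 4.x classpath):

    List<Token> v = new ArrayList<Token>();
    try (TokenStream source = analyzer.tokenStream(field, new StringReader(token))) {
        CharTermAttribute charTermAtrib = source.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtrib = source.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
        source.reset();
        while (source.incrementToken()) {
            Token t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(),
                    offsetAtrib.startOffset(), offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
            v.add(t);
        }
        source.end();
    } catch (IOException e) {
        // treat an analysis failure as "no tokens", as the original does
    }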
From source file:com.mhs.qsol.QsolToQueryVisitor.java
License:Apache License
/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate query.
 *
 * @param token
 * @return
 */
protected Query tokenToQuery(String token) {
    token = removeEscapeChars(token);

    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);

    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            t = null;
        }
        if (t == null) {
            break;
        }
        v.add(t);
        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            severalTokensAtSamePosition = true;
        }
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }

    if (v.size() == 0) {
        // nulls will get cleaned up in visitBooleanOp
        return null;
    } else if (v.size() == 1) {
        t = v.get(0);
        TermQuery termQuery = new TermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        termQuery.setBoost(this.boost);
        return termQuery;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                BooleanQuery q = new BooleanQuery(true);
                for (int i = 0; i < v.size(); i++) {
                    t = v.get(i);
                    TermQuery currentQuery = new TermQuery(
                            new Term(field, new String(t.buffer(), 0, t.length())));
                    currentQuery.setBoost(this.boost);
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q;
            } else {
                // All the Tokens in each sub-list are positioned at the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }

                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);
                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }
                    int size = curTermQueries.size();
                    if (size <= 0)
                        continue;
                    else if (size == 1)
                        spanNearSubclauses.add(curTermQueries.get(0));
                    else
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                }

                SpanNearQuery query = new SpanNearQuery(
                        (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true);
                return query;
            }
        } else {
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];
            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                SpanTermQuery spanQuery = new SpanTermQuery(
                        new Term(field, new String(t2.buffer(), 0, t2.length())));
                spanQuery.setBoost(boost);
                clauses[i] = spanQuery;
            }
            // Note: there's a bug here (not by me) where term offsets are not respected.
            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);
            return query;
        }
    }
}
From source file:com.nec.scg.senseRanking.CountTextSimilarity.java
public Map<String, Float> CountTF_IDF(String str, Analyzer a) {
    Map<String, Float> termVector = new TreeMap<String, Float>();
    try {
        TokenStream stream = a.tokenStream("content", new StringReader(str));
        PorterStemFilter filter = new PorterStemFilter(stream);
        CharTermAttribute cta = filter.addAttribute(CharTermAttribute.class);
        filter.reset();

        // Count raw term frequencies over the stemmed tokens
        String strcat = null;
        int wordCount = 0;
        while (filter.incrementToken()) {
            strcat = cta.toString();
            // System.out.print("[" + strcat + "]");
            if (!termVector.containsKey(strcat)) {
                termVector.put(strcat, 1f);
            } else {
                termVector.put(strcat, termVector.get(strcat) + 1);
            }
            wordCount++;
        }

        // Replace each raw count with its tf-idf weight
        for (String ter : termVector.keySet()) {
            int hits = searchIndexforIDF(ter) + 1;
            float idf = (float) (Math.log(AllArticle * 1.0 / hits) + 1.0);
            float tf = termVector.get(ter) / wordCount;
            termVector.put(ter, tf * idf);
        }

        // end()/close() on the filter already delegate down the chain,
        // so the extra calls on the wrapped stream are redundant but harmless.
        filter.end();
        stream.end();
        filter.close();
        stream.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return termVector;
}
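To make the weighting concrete, here is the arithmetic with purely hypothetical numbers (3 occurrences in a 60-token document, AllArticle = 10,000 articles, 99 index hits for the term):

    // tf: frequency normalized by document length
    float tf = 3f / 60;                                        // 0.05
    // idf: smoothed inverse document frequency; hits + 1 avoids division by zero
    float idf = (float) (Math.log(10000.0 / (99 + 1)) + 1.0);  // ln(100) + 1 ~= 5.605
    float weight = tf * idf;                                   // ~= 0.280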
From source file:com.plug.Version_8_5_2.gs.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/**
 * Create GlobalSight TM tokens from a provided segment string using
 * GsAnalyzer.
 *
 * @param p_text
 *            fuzzy match format string
 * @return List of c.g.l.tm2.index.Tokens
 */
public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    // GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class);
    // org.apache.lucene.analysis.Token luceneToken = null;
    List<String> tokens = new ArrayList<String>();

    while (tokenStream.incrementToken()) {
        // luceneToken = gsAtt.getToken();
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();

    return buildTokenList(tokens);
}
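This helper calls reset() and close() but skips end(), and it re-fetches the CharTermAttribute on every iteration even though attribute instances are stable for the life of the stream. A hedged touch-up of the consume loop (same names as above):

    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); // hoisted out of the loop
    tokenStream.reset();
    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        tokens.add(termAtt.toString());
    }
    tokenStream.end();     // finish end-of-stream bookkeeping before releasing resources
    tokenStream.close();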