List of usage examples for org.apache.lucene.analysis.TokenStream.close()
@Override public void close() throws IOException
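Before the individual examples, here is a minimal sketch of the lifecycle that these snippets follow: obtain the stream from an Analyzer, call reset(), consume tokens with incrementToken(), call end(), and finally close(). This sketch is not taken from any of the listed sources; it assumes a recent Lucene version in which StandardAnalyzer has a no-argument constructor and Analyzer.tokenStream accepts a String.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("field", "some text to tokenize");
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        try {
            stream.reset();                  // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();                    // records end-of-stream state (e.g. final offset)
        } finally {
            stream.close();                  // releases resources; required before the stream can be reused
        }
    }
}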
From source file:io.anserini.analysis.TweetTokenizationTest.java
License:Apache License
public List<String> parseKeywords(Analyzer analyzer, String keywords) throws IOException {
    List<String> list = new ArrayList<>();

    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords));
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (cattr.toString().length() == 0) {
            continue;
        }
        list.add(cattr.toString());
    }
    tokenStream.end();
    tokenStream.close();

    return list;
}
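In the example above, close() is not protected by a finally block, so an exception thrown from incrementToken() would leave the stream open. Because TokenStream implements Closeable, the same logic can be written with try-with-resources so that close() always runs. The following variant is a sketch, not part of the Anserini source, and the method name parseKeywordsSafely is hypothetical.

public List<String> parseKeywordsSafely(Analyzer analyzer, String keywords) throws IOException {
    List<String> list = new ArrayList<>();
    try (TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords))) {
        CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            if (cattr.toString().length() > 0) {
                list.add(cattr.toString());
            }
        }
        tokenStream.end();
    } // close() is invoked automatically here, even if incrementToken() throws
    return list;
}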
From source file:it.cnr.ilc.lc.clavius.search.ClaviusHighlighter.java
public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, String idDoc,
        boolean mergeContiguousFragments, int maxNumFragments) throws IOException, InvalidTokenOffsetsException {
    List<Annotation> ret = new ArrayList<>();
    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    Scorer fragmentScorer = getFragmentScorer();
    Fragmenter textFragmenter = getTextFragmenter();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();
    Encoder encoder = getEncoder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    // FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
    try {
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        // textFragmenter.start(text, tokenStream);

        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);
        tokenStream.reset();

        for (boolean next = tokenStream.incrementToken();
                next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
                next = tokenStream.incrementToken()) {
            // if ((offsetAtt.endOffset() > text.length())
            //         || (offsetAtt.startOffset() > text.length())) {
            //     throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
            //             + " exceeds length of provided text sized " + text.length());
            // }
            tokenGroup.addToken(fragmentScorer.getTokenScore());
        }

        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(idDoc);

                // left context
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);

                // right context
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1 : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());

                a.setConcept("");
                a.setType("");
                a.setIdNeo4j(-1l);
                a.setPageNum(-1l);
                a.setResourceObject("");
                a.setId(-1l);
                ret.add(a);
            }
        }
        return ret;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
From source file:it.cnr.ilc.lc.claviusweb.fulltextsearch.ClaviusHighlighter.java
public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, Document document,
        boolean mergeContiguousFragments, int maxNumFragments) throws IOException, InvalidTokenOffsetsException {
    List<Annotation> ret = new ArrayList<>();
    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    Scorer fragmentScorer = getFragmentScorer();
    Fragmenter textFragmenter = getTextFragmenter();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();
    Encoder encoder = getEncoder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    // FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
    try {
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        // textFragmenter.start(text, tokenStream);

        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);
        tokenStream.reset();

        for (boolean next = tokenStream.incrementToken();
                next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
                next = tokenStream.incrementToken()) {
            // if ((offsetAtt.endOffset() > text.length())
            //         || (offsetAtt.startOffset() > text.length())) {
            //     throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
            //             + " exceeds length of provided text sized " + text.length());
            // }
            tokenGroup.addToken(fragmentScorer.getTokenScore());
        }

        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(document.get("idDoc"));

                // left context
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);

                // right context
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1 : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());

                a.setConcept("");
                a.setType("");
                a.setPageNum(-1l);
                // a.setIdNeo4j(Long.parseLong(document.get("idNeo4j")));
                a.setIdNeo4j(Long.parseLong(document.get("idDoc")));
                a.setResourceObject("");
                a.setId(-1l);
                ret.add(a);
            }
        }
        return ret;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
From source file:it.unibz.instasearch.indexing.StorageIndexer.java
License:Open Source License
/**
 * Extracts terms from text.
 *
 * @param text
 * @return a map of terms to their offsets in text
 * @throws IOException
 */
public static Map<String, List<Integer>> extractTextTerms(String text) throws IOException {
    Map<String, List<Integer>> terms = new HashMap<String, List<Integer>>();

    TokenStream tokenStream = fileAnalyzer.tokenStream(Field.CONTENTS.toString(), new StringReader(text));
    TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class);

    while (tokenStream.incrementToken()) {
        String termText = termAtt.term().toLowerCase();
        int offset = offsetAtt.startOffset();

        List<Integer> offsets = terms.get(termText);
        if (offsets == null) {
            offsets = new LinkedList<Integer>();
            terms.put(termText, offsets);
        }
        offsets.add(offset);
    }
    tokenStream.close();

    return terms;
}
From source file:jaligner.Sequence.java
License:Open Source License
/**
 * Constructor
 *
 * @param sequence
 */
public Sequence(String sequence, Analyzer analyzer, int max_length) throws IOException {
    super();
    this.sequence = sequence;

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(sequence));
    Token.TokenAttributeFactory tokenAttributeFactory = new Token.TokenAttributeFactory(
            stream.getAttributeFactory());

    Vector<Token> tokenVector = new Vector<Token>();
    while (stream.incrementToken() && tokenVector.size() < max_length) {
        Token token = (Token) tokenAttributeFactory.createAttributeInstance(Token.class);

        CharTermAttribute charTerm = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        // PayloadAttribute payload = stream.getAttribute(PayloadAttribute.class);
        // FlagsAttribute flags = stream.getAttribute(FlagsAttribute.class);

        token.reinit(charTerm.buffer(), 0, charTerm.length(), offset.startOffset(), offset.endOffset());
        token.setOffset(offset.startOffset(), offset.endOffset());
        // token.setPayload(payload.getPayload());
        // token.setFlags(flags.getFlags());

        if (stream.hasAttribute(PositionIncrementAttribute.class)) {
            PositionIncrementAttribute positionIncrement = stream
                    .getAttribute(PositionIncrementAttribute.class);
            token.setPositionIncrement(positionIncrement.getPositionIncrement());
        }
        if (stream.hasAttribute(TypeAttribute.class)) {
            TypeAttribute type = stream.getAttribute(TypeAttribute.class);
            token.setType(type.type());
        }
        tokenVector.add(token);
    }
    stream.end();
    stream.close();

    this.tokens = tokenVector.toArray(new Token[tokenVector.size()]);
}
From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.MapperQueryParser.java
License:Apache License
private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (!analyzeWildcard) {
        return super.getPrefixQuery(field, termStr);
    }
    // get Analyzer from superclass and tokenize the term
    TokenStream source;
    try {
        source = getAnalyzer().tokenStream(field, termStr);
        source.reset();
    } catch (IOException e) {
        return super.getPrefixQuery(field, termStr);
    }
    List<String> tlist = new ArrayList<>();
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);

    while (true) {
        try {
            if (!source.incrementToken()) break;
        } catch (IOException e) {
            break;
        }
        tlist.add(termAtt.toString());
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }

    if (tlist.size() == 1) {
        return super.getPrefixQuery(field, tlist.get(0));
    } else {
        // build a boolean query with prefix on each one...
        List<BooleanClause> clauses = new ArrayList<>();
        for (String token : tlist) {
            clauses.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
        }
        return getBooleanQuery(clauses, true);

        // return super.getPrefixQuery(field, termStr);
        /* this means that the analyzer used either added or consumed
         * (common for a stemmer) tokens, and we can't build a PrefixQuery */
        // throw new ParseException("Cannot build PrefixQuery with analyzer "
        //         + getAnalyzer().getClass()
        //         + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
    }
}
From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.MapperQueryParser.java
License:Apache License
private Query getPossiblyAnalyzedWildcardQuery(String field, String termStr) throws ParseException {
    if (!analyzeWildcard) {
        return super.getWildcardQuery(field, termStr);
    }
    boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
    StringBuilder aggStr = new StringBuilder();
    StringBuilder tmp = new StringBuilder();
    for (int i = 0; i < termStr.length(); i++) {
        char c = termStr.charAt(i);
        if (c == '?' || c == '*') {
            if (isWithinToken) {
                try {
                    TokenStream source = getAnalyzer().tokenStream(field, tmp.toString());
                    source.reset();
                    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
                    if (source.incrementToken()) {
                        String term = termAtt.toString();
                        if (term.length() == 0) {
                            // no tokens, just use what we have now
                            aggStr.append(tmp);
                        } else {
                            aggStr.append(term);
                        }
                    } else {
                        // no tokens, just use what we have now
                        aggStr.append(tmp);
                    }
                    source.close();
                } catch (IOException e) {
                    aggStr.append(tmp);
                }
                tmp.setLength(0);
            }
            isWithinToken = false;
            aggStr.append(c);
        } else {
            tmp.append(c);
            isWithinToken = true;
        }
    }
    if (isWithinToken) {
        try {
            TokenStream source = getAnalyzer().tokenStream(field, tmp.toString());
            source.reset();
            CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
            if (source.incrementToken()) {
                String term = termAtt.toString();
                if (term.length() == 0) {
                    // no tokens, just use what we have now
                    aggStr.append(tmp);
                } else {
                    aggStr.append(term);
                }
            } else {
                // no tokens, just use what we have now
                aggStr.append(tmp);
            }
            source.close();
        } catch (IOException e) {
            aggStr.append(tmp);
        }
    }
    return super.getWildcardQuery(field, aggStr.toString());
}
From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizerTest.java
License:Apache License
@Test
public void testNullSynonyms() throws Exception {
    Analyzer a = new NGramSynonymTokenizerTestAnalyzer(1);
    TokenStream stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,1,1/,1,2,1/,2,3,1/,3,4,1/,4,5,1/,5,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(2);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,2,1/,1,3,1/,2,4,1/,3,5,1/,4,6,1");

    stream.close();
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,1,1");

    stream.close();
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,2,1");

    a = new NGramSynonymTokenizerTestAnalyzer(3);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,3,1/,1,4,1/,2,5,1/,3,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(4);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,4,1/,1,5,1/,2,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(5);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,5,1/,1,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(6);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(7);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(8);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");
}
From source file:kafka.examples.Producer.java
License:Apache License
public void run() {
    while (true) {
        String access_token = "2.009F1d9BmHHChD7abcd6de0a0jui5Y";
        int count = 20;
        Timeline tm = new Timeline(access_token);
        Analyzer analyzer4 = new IKAnalyzer(false);
        try {
            StatusWapper status = tm.getPublicTimeline(count, 0);

            try {
                TokenStream tokenstream = analyzer4.tokenStream("", new StringReader(status.toString()));
                CharTermAttribute termAttribute = tokenstream.addAttribute(CharTermAttribute.class);
                tokenstream.reset();
                while (tokenstream.incrementToken()) {
                    String prTxt = new String(termAttribute.buffer(), 0, termAttribute.length());
                    // producer.send(new KeyedMessage<Integer, String>(topic, ptTxt + " "));
                    System.out.print(prTxt + " ");
                }
                // System.out.println();
                tokenstream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }

            producer.send(new KeyedMessage<Integer, String>(topic, status.toString()));
            Log.logInfo(status.toString());
        } catch (WeiboException e) {
            e.printStackTrace();
        }
    }
}
From source file:lia.chapter4.AnalyzerUtils.java
License:Apache License
public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));

    TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.getBytesRef().utf8ToString());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.close();
}