List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
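All of the examples below follow the same TokenStream contract: obtain the stream, register the attributes you want to read, call reset() before the first incrementToken(), then call end() and close() when iteration finishes. Here is a minimal sketch of that workflow; the field name "content", the WhitespaceAnalyzer, and the sample input are illustrative assumptions (not taken from the examples below), and a recent Lucene (5+) without Version-argument constructors is assumed.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetExample {
    public static void main(String[] args) throws IOException {
        // WhitespaceAnalyzer and the "content" field name are placeholder choices
        Analyzer analyzer = new WhitespaceAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("content", new StringReader("some sample text"))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end(); // records the final offset state after the last token
        } // try-with-resources closes the stream
    }
}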
From source file:it.cnr.ilc.lc.clavius.search.ClaviusHighlighter.java
public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, String idDoc,
        boolean mergeContiguousFragments, int maxNumFragments) throws IOException, InvalidTokenOffsetsException {
    List<Annotation> ret = new ArrayList<>();
    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();
    Scorer fragmentScorer = getFragmentScorer();
    Fragmenter textFragmenter = getTextFragmenter();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();
    Encoder encoder = getEncoder();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());
    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }
    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);
    try {
        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);
        tokenStream.reset();
        for (boolean next = tokenStream.incrementToken();
                next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
                next = tokenStream.incrementToken()) {
            tokenGroup.addToken(fragmentScorer.getTokenScore());
        }
        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(idDoc);
                // left context
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);
                // right context
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1
                                : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());
                a.setConcept("");
                a.setType("");
                a.setIdNeo4j(-1L);
                a.setPageNum(-1L);
                a.setResourceObject("");
                a.setId(-1L);
                ret.add(a);
            }
        }
        return ret;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
                // ignore errors while releasing the stream
            }
        }
    }
}
From source file:it.cnr.ilc.lc.claviusweb.fulltextsearch.ClaviusHighlighter.java
public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, Document document,
        boolean mergeContiguousFragments, int maxNumFragments) throws IOException, InvalidTokenOffsetsException {
    List<Annotation> ret = new ArrayList<>();
    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();
    Scorer fragmentScorer = getFragmentScorer();
    Fragmenter textFragmenter = getTextFragmenter();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();
    Encoder encoder = getEncoder();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());
    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }
    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);
    try {
        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);
        tokenStream.reset();
        for (boolean next = tokenStream.incrementToken();
                next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
                next = tokenStream.incrementToken()) {
            tokenGroup.addToken(fragmentScorer.getTokenScore());
        }
        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(document.get("idDoc"));
                // left context
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);
                // right context
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1
                                : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());
                a.setConcept("");
                a.setType("");
                a.setPageNum(-1L);
                // a.setIdNeo4j(Long.parseLong(document.get("idNeo4j")));
                a.setIdNeo4j(Long.parseLong(document.get("idDoc")));
                a.setResourceObject("");
                a.setId(-1L);
                ret.add(a);
            }
        }
        return ret;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
                // ignore errors while releasing the stream
            }
        }
    }
}
From source file:it.cnr.isti.hpc.dexter.analysis.DexterAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException {
    String str = "<body>perch";
    Analyzer anal = new DexterAnalyzer();
    TokenStream ts = anal.tokenStream("content", new StringReader(str));
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(termAtt.toString().substring(0, termAtt.length()));
        System.out.println("token start offset: " + offsetAtt.startOffset());
        System.out.println(" token end offset: " + offsetAtt.endOffset());
    }
}
From source file:it.cnr.isti.hpc.dexter.analysis.SpotCleaner.java
License:Apache License
public String clean(String spot) throws IOException {
    try {
        spot = URLDecoder.decode(spot, "UTF-8");
    } catch (IllegalArgumentException e) {
        // keep the raw spot if it is not valid URL-encoded text
    }
    analyzer.lowercase(spot.length() > 4);
    TokenStream ts = analyzer.tokenStream("content", new StringReader(spot));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    sb.setLength(0);
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
        sb.append(termAtt.toString());
        sb.append(' ');
        if (tokens > maxSpotLength) {
            return "";
        }
    }
    ts.end();
    ts.close(); // release the stream: the TokenStream contract requires close() after end()
    if (sb.length() > 0)
        sb.setLength(sb.length() - 1); // drop the trailing space
    String finalSpot = sb.toString();
    for (Filter<String> filter : filters) {
        if (filter.isFilter(finalSpot)) {
            finalSpot = "";
        }
    }
    return finalSpot;
}
From source file:it.cnr.isti.hpc.dexter.spot.DocumentFrequencyGenerator.java
License:Apache License
private void initBloomFilter(Iterator<String> spotIterator) {
    String spot = spotIterator.next();
    analyzer.setShingles(false);
    ProgressLogger pl = new ProgressLogger("added {} spots to the bloom filter", 100000);
    pl.up();
    while (spotIterator.hasNext()) {
        String next = spotIterator.next();
        if (next.equals(spot))
            continue;
        pl.up();
        spot = next;
        TokenStream ts;
        try {
            ts = analyzer.tokenStream("content", new StringReader(spot));
        } catch (IOException e) {
            e.printStackTrace();
            continue; // no stream to read from, skip this spot
        }
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();
            if (ts.incrementToken()) {
                spot = termAtt.toString();
                bf.add(spot);
            }
            ts.end();
            ts.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
From source file:jobs.LoadOntologyJob.java
private int getTotalLength(String label) throws IOException {
    // this analyzer doesn't remove stop words
    Analyzer customanalyzer = new CustomStopWordsStandardAnalyzer(Version.LUCENE_47);
    List<String> resultStop = new ArrayList<String>();
    TokenStream customstream = customanalyzer.tokenStream(null, new StringReader(label));
    customstream.reset();
    while (customstream.incrementToken()) {
        resultStop.add(customstream.getAttribute(CharTermAttribute.class).toString());
    }
    return resultStop.size();
}
From source file:jobs.LoadOntologyJob.java
private int getLengthWithoutStopWords(String label) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    List<String> result = new ArrayList<String>();
    TokenStream stream = analyzer.tokenStream(null, new StringReader(label));
    stream.reset();
    while (stream.incrementToken()) {
        result.add(stream.getAttribute(CharTermAttribute.class).toString());
    }
    return result.size();
}
From source file:jp.co.atware.solr.analizers.cjk.MultistageMappingCharFilterTest.java
License:Apache License
@Theory
public void testMultiMappingAndOffset(TestData testData) throws Exception {
    Reader reader = charFilterFactory.create(new StringReader(testData.input));
    TokenStream tokenStream = tokenizerFactory.create(reader);
    OffsetAttribute actualOffset = tokenStream.getAttribute(OffsetAttribute.class);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    assertThat(tokenStream.incrementToken(), is(true));
    assertThat(termAtt.toString(), is(testData.expected));
    assertThat(actualOffset.startOffset(), is(testData.start));
    assertThat(actualOffset.endOffset(), is(testData.end));
    assertThat(tokenStream.incrementToken(), is(false));
}
From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.MapperQueryParser.java
License:Apache License
private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (!analyzeWildcard) {
        return super.getPrefixQuery(field, termStr);
    }
    // get Analyzer from superclass and tokenize the term
    TokenStream source;
    try {
        source = getAnalyzer().tokenStream(field, termStr);
        source.reset();
    } catch (IOException e) {
        return super.getPrefixQuery(field, termStr);
    }
    List<String> tlist = new ArrayList<>();
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
    while (true) {
        try {
            if (!source.incrementToken()) break;
        } catch (IOException e) {
            break;
        }
        tlist.add(termAtt.toString());
    }
    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }
    if (tlist.size() == 1) {
        return super.getPrefixQuery(field, tlist.get(0));
    } else {
        // the analyzer added or consumed tokens (common for a stemmer), so a single
        // PrefixQuery cannot be built; build a boolean query with a prefix on each token
        List<BooleanClause> clauses = new ArrayList<>();
        for (String token : tlist) {
            clauses.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
        }
        return getBooleanQuery(clauses, true);
    }
}
From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.MapperQueryParser.java
License:Apache License
private Query getPossiblyAnalyzedWildcardQuery(String field, String termStr) throws ParseException {
    if (!analyzeWildcard) {
        return super.getWildcardQuery(field, termStr);
    }
    boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
    StringBuilder aggStr = new StringBuilder();
    StringBuilder tmp = new StringBuilder();
    for (int i = 0; i < termStr.length(); i++) {
        char c = termStr.charAt(i);
        if (c == '?' || c == '*') {
            if (isWithinToken) {
                try {
                    TokenStream source = getAnalyzer().tokenStream(field, tmp.toString());
                    source.reset();
                    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
                    if (source.incrementToken()) {
                        String term = termAtt.toString();
                        if (term.length() == 0) {
                            // no tokens, just use what we have now
                            aggStr.append(tmp);
                        } else {
                            aggStr.append(term);
                        }
                    } else {
                        // no tokens, just use what we have now
                        aggStr.append(tmp);
                    }
                    source.close();
                } catch (IOException e) {
                    aggStr.append(tmp);
                }
                tmp.setLength(0);
            }
            isWithinToken = false;
            aggStr.append(c);
        } else {
            tmp.append(c);
            isWithinToken = true;
        }
    }
    if (isWithinToken) {
        try {
            TokenStream source = getAnalyzer().tokenStream(field, tmp.toString());
            source.reset();
            CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
            if (source.incrementToken()) {
                String term = termAtt.toString();
                if (term.length() == 0) {
                    // no tokens, just use what we have now
                    aggStr.append(tmp);
                } else {
                    aggStr.append(term);
                }
            } else {
                // no tokens, just use what we have now
                aggStr.append(tmp);
            }
            source.close();
        } catch (IOException e) {
            aggStr.append(tmp);
        }
    }
    return super.getWildcardQuery(field, aggStr.toString());
}