List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException

Called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API). This method can be used to perform any end-of-stream operations, such as setting the final offset of a stream.
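All of the examples below follow the same consumer contract: reset(), then incrementToken() until it returns false, then end(), then close(). A minimal sketch of that contract, assuming a recent Lucene where StandardAnalyzer has a no-argument constructor (the class name EndUsageSketch, the field name "field", and the sample text are illustrative only):

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EndUsageSketch {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("field", new StringReader("hello world"))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // returns false when the stream is exhausted
                System.out.println(term.toString());
            }
            stream.end();                     // end-of-stream work, e.g. records the final offset
        }                                     // try-with-resources calls close()
    }
}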
From source file: edu.virginia.cs.utility.StringTokenizer.java
/**
 * Method that generates a list of tokens from the parameter string.
 *
 * @param string
 * @return list of tokens generated
 */
public List<String> TokenizeString(String string) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
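This example calls end() and close() only on the success path; if incrementToken() throws, both are skipped. A variant sketch (not part of the original source) using try-with-resources, which also fetches the CharTermAttribute once instead of once per token:

public List<String> tokenizeString(Analyzer analyzer, String text) {
    List<String> result = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(null, new StringReader(text))) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(term.toString());
        }
        stream.end(); // close() still runs if anything above throws
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}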
From source file: elhuyar.bilakit.PayloadQParserPlugin.java
License: Open Source License
@Override
protected Query getFieldQuery(String field, String queryText, boolean quoted) throws SyntaxError {
    SchemaField sf = this.schema.getFieldOrNull(field);
    if (!quoted && sf != null && sf.getType().getTypeName().endsWith("_payloads")) {
        // Analyze queryText
        List<String> result = new ArrayList<String>();
        try {
            TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(queryText));
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            // not thrown b/c we're using a string reader...
            throw new RuntimeException(e);
        }
        queryText = result.toString().replaceAll("\\[|\\]", "").replaceAll(", ", " ");
        // Note that this will work for any field defined with the
        // <fieldType> of "*_payloads"
        Query plter = new PayloadTermQuery(new Term(field, queryText), new AveragePayloadFunction(), true);
        return plter;
    }
    return super.getFieldQuery(field, queryText, quoted);
}
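The toString()/replaceAll() round trip that rebuilds the analyzed query string can be expressed more directly; a one-line sketch assuming Java 8+:

String analyzedQueryText = String.join(" ", result); // same space-separated string, no bracket stripping needed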
From source file: ikanalyzer.IKAnalzyerDemo.java
License: Apache License
public static void main(String[] args) {
    // Construct an IK Analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain a Lucene TokenStream object
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield",
                new StringReader("This is a Chinese segmentation example. IKAnalyer can analysis english text too"));
        // Get the offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Get the term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Get the term type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (and the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // Signal the end of the TokenStream
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the TokenStream's resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
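end() is exactly where the final offset becomes visible: the tokenizer records the total number of characters it consumed. A short sketch of reading it, using the same ts and offset variables as the demo above (illustrative, not part of the original source):

ts.reset();
while (ts.incrementToken()) {
    // consume all tokens
}
ts.end();
int finalOffset = offset.endOffset(); // total characters read; valid only after end()
ts.close();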
From source file: indexer.LineDocumentIndexer.java
Document constructDoc(FileWriter fw, String id, String line) throws Exception {
    Document doc = new Document();
    doc.add(new Field(DocVector.FIELD_ID, id, Field.Store.YES, Field.Index.NOT_ANALYZED));

    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(FIELD_WORDS, new StringReader(line));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString().toLowerCase();
        tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();

    tokenizedContentBuff.append("\n");
    fw.write(id + "\t" + tokenizedContentBuff.toString());

    // Reanalyze
    doc.add(new Field(FIELD_WORDS, line, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    return doc;
}
From source file: indexer.Paragraph.java
List<Paragraph> constructParagraphs(int docId, String content) throws Exception {
    List<Paragraph> parList = new ArrayList<>();
    List<String> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    int count = 0;
    int id = 0;
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        tokens.add(term);
        count++;
        if (count == paraWindowSize) {
            // Create a paragraph
            Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
            tokens.clear();
            count = 0;
            parList.add(p);
        }
    }
    if (count > 0) {
        Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
        parList.add(p);
    }
    stream.end();
    stream.close();
    return parList;
}
From source file: info.johtani.elasticsearch.action.admin.indices.extended.analyze.TransportExtendedAnalyzeAction.java
License: Apache License
private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream,
        Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset)
        throws IOException {
    List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
    stream.reset();
    // Set up the attributes to read from each token
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            lastPosition = lastPosition + increment;
        }
        tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition,
                lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
                extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    return tokens;
}
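Note that this helper calls end() but deliberately leaves close() to whoever opened the stream. A hedged sketch of such a caller (the surrounding variable names are hypothetical):

TokenStream stream = analyzer.tokenStream(field, text);
try {
    tokens.addAll(processAnalysis(stream, includeAttributes, shortAttrName, lastPosition, lastOffset));
} finally {
    stream.close(); // ownership of close() stays with the code that opened the stream
}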
From source file: io.anserini.analysis.TweetTokenizationTest.java
License: Apache License
public List<String> parseKeywords(Analyzer analyzer, String keywords) throws IOException {
    List<String> list = new ArrayList<>();
    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords));
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (cattr.toString().length() == 0) {
            continue; // skip empty tokens
        }
        list.add(cattr.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return list;
}
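A usage sketch (the analyzer choice and input are illustrative; the test itself targets tweet analysis):

List<String> terms = parseKeywords(new WhitespaceAnalyzer(), "obama family tree");
// terms -> [obama, family, tree]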
From source file: it.cnr.ilc.lc.clavius.search.ClaviusHighlighter.java
public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, String idDoc,
        boolean mergeContiguousFragments, int maxNumFragments) throws IOException, InvalidTokenOffsetsException {

    List<Annotation> ret = new ArrayList<>();
    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    Scorer fragmentScorer = getFragmentScorer();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    try {
        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);
        tokenStream.reset();
        // Score every token up to the analysis limit
        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            tokenGroup.addToken(fragmentScorer.getTokenScore());
        }
        // Build an Annotation, with left and right context, for every positively scored token
        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(idDoc);
                // Left context
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);
                // Right context
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1
                                : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());
                a.setConcept("");
                a.setType("");
                a.setIdNeo4j(-1l);
                a.setPageNum(-1l);
                a.setResourceObject("");
                a.setId(-1l);
                ret.add(a);
            }
        }
        return ret;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
From source file: it.cnr.ilc.lc.claviusweb.fulltextsearch.ClaviusHighlighter.java
public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, Document document,
        boolean mergeContiguousFragments, int maxNumFragments) throws IOException, InvalidTokenOffsetsException {

    List<Annotation> ret = new ArrayList<>();
    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    Scorer fragmentScorer = getFragmentScorer();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    try {
        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);
        tokenStream.reset();
        // Score every token up to the analysis limit
        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            tokenGroup.addToken(fragmentScorer.getTokenScore());
        }
        // Build an Annotation, with left and right context, for every positively scored token
        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(document.get("idDoc"));
                // Left context
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);
                // Right context
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1
                                : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());
                a.setConcept("");
                a.setType("");
                a.setPageNum(-1l);
                // a.setIdNeo4j(Long.parseLong(document.get("idNeo4j")));
                a.setIdNeo4j(Long.parseLong(document.get("idDoc")));
                a.setResourceObject("");
                a.setId(-1l);
                ret.add(a);
            }
        }
        return ret;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
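Both highlighter variants swallow any exception from end()/close() with an empty catch block. A sketch of an alternative finally clause using Lucene's own org.apache.lucene.util.IOUtils, which exists for exactly this pattern (a hedged rewrite, not the original source):

} finally {
    if (tokenStream != null) {
        try {
            tokenStream.end();
        } catch (IOException e) {
            // intentionally ignored, as in the original
        }
        IOUtils.closeWhileHandlingException(tokenStream); // close() without propagating IOException
    }
}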
From source file: it.cnr.isti.hpc.dexter.analysis.SpotCleaner.java
License: Apache License
public String clean(String spot) throws IOException {
    try {
        spot = URLDecoder.decode(spot, "UTF-8");
    } catch (IllegalArgumentException e) {
        // keep the raw spot if it is not valid URL encoding
    }
    analyzer.lowercase(spot.length() > 4);
    TokenStream ts = analyzer.tokenStream("content", new StringReader(spot));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    sb.setLength(0);
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
        sb.append(termAtt.toString());
        sb.append(' ');
        if (tokens > maxSpotLength) {
            return "";
        }
    }
    ts.end();
    ts.close();
    if (sb.length() > 0)
        sb.setLength(sb.length() - 1); // drop the trailing space
    String finalSpot = sb.toString();
    for (Filter<String> filter : filters) {
        if (filter.isFilter(finalSpot)) {
            finalSpot = "";
        }
    }
    return finalSpot;
}
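When the spot exceeds maxSpotLength, the method returns early and never reaches end()/close(), leaking the stream. A hedged sketch of the same loop with try-with-resources (the field name and the surrounding sb/analyzer/maxSpotLength state are taken from the original):

try (TokenStream ts = analyzer.tokenStream("content", new StringReader(spot))) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    sb.setLength(0);
    int tokens = 0;
    while (ts.incrementToken()) {
        if (++tokens > maxSpotLength) {
            return ""; // the stream is still closed by try-with-resources
        }
        sb.append(termAtt.toString()).append(' ');
    }
    ts.end();
}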