List of usage examples for org.apache.lucene.analysis TokenStream end

public void end() throws IOException

Called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API). It performs end-of-stream operations such as setting the final offset of the stream.
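All of the examples below follow the same consumer workflow: reset(), a loop of incrementToken() until it returns false, end(), then close(). A minimal, self-contained sketch of that workflow, assuming Lucene 4.x; the field name "body" and the StandardAnalyzer choice are illustrative, not taken from any example below:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class TokenStreamEndExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        TokenStream ts = analyzer.tokenStream("body", new StringReader("the quick brown fox "));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // returns false when the stream is exhausted
                System.out.println(termAtt.toString());
            }
            ts.end();                     // end-of-stream work, e.g. recording the final offset
            System.out.println("final offset: " + offsetAtt.endOffset());
        } finally {
            ts.close();                   // always close, even if analysis failed
        }
    }
}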
From source file:jaligner.Sequence.java
License:Open Source License
/**
 * Constructor
 *
 * @param sequence   the raw text to tokenize
 * @param analyzer   the Analyzer used to produce the tokens
 * @param max_length the maximum number of tokens to keep
 */
public Sequence(String sequence, Analyzer analyzer, int max_length) throws IOException {
    super();
    this.sequence = sequence;
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(sequence));
    Token.TokenAttributeFactory tokenAttributeFactory = new Token.TokenAttributeFactory(
            stream.getAttributeFactory());
    Vector<Token> tokenVector = new Vector<Token>();
    stream.reset(); // the consumer contract requires reset() before the first incrementToken()
    while (stream.incrementToken() && tokenVector.size() < max_length) {
        Token token = (Token) tokenAttributeFactory.createAttributeInstance(Token.class);
        CharTermAttribute charTerm = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        token.reinit(charTerm.buffer(), 0, charTerm.length(), offset.startOffset(), offset.endOffset());
        token.setOffset(offset.startOffset(), offset.endOffset());
        if (stream.hasAttribute(PositionIncrementAttribute.class)) {
            PositionIncrementAttribute positionIncrement = stream
                    .getAttribute(PositionIncrementAttribute.class);
            token.setPositionIncrement(positionIncrement.getPositionIncrement());
        }
        if (stream.hasAttribute(TypeAttribute.class)) {
            TypeAttribute type = stream.getAttribute(TypeAttribute.class);
            token.setType(type.type());
        }
        tokenVector.add(token);
    }
    stream.end();
    stream.close();
    this.tokens = tokenVector.toArray(new Token[tokenVector.size()]);
}
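A hypothetical call site for this constructor; the analyzer choice and the 128-token cap below are assumptions for illustration, not taken from the jaligner sources:

// Hypothetical usage: any Analyzer works; StandardAnalyzer and the cap are illustrative.
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
Sequence seq = new Sequence("The quick brown fox jumps over the lazy dog", analyzer, 128);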
From source file:lux.search.highlight.XmlHighlighter.java
License:Mozilla Public License
/**
 * Inspired by org.apache.lucene.search.highlight.Highlighter.
 * Sends highlighted events to the writer, tokenizing the text held by the
 * textReader instance field.
 * @throws XMLStreamException
 */
private void highlightTextNode() throws IOException, XMLStreamException {
    TokenStream tokenStream = analyzer.tokenStream(textFieldName, textReader);
    xmlStreamTokens.reset(tokenStream);
    lastEndOffset = 0;
    for (boolean next = xmlStreamTokens.incrementToken(); next
            && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = xmlStreamTokens.incrementToken()) {
        if (scorerTokens != null && xmlStreamTokens.isPlainToken()) {
            scorerTokens.incrementToken();
        }
        if (tokenGroup.isDistinct()) {
            // write out any accumulated tokens
            handleTokenGroup();
            tokenGroup.clear();
        }
        if (scorerTokens == null || xmlStreamTokens.isPlainToken()) {
            tokenGroup.addToken(scorer.getTokenScore());
        }
    }
    handleTokenGroup();
    tokenGroup.clear();
    writeTrailingText();
    tokenStream.end();
    tokenStream.close();
}
From source file:net.mad.ads.server.utils.http.KeywordUtils.java
License:Open Source License
public static List<String> getTokens(String queryString) {
    try {
        GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_33);
        TokenStream ts = a.tokenStream("", new StringReader(queryString));
        List<String> tokens = new ArrayList<String>();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String token = termAtt.toString();
            tokens.add(token);
        }
        ts.end();
        ts.close();
        return tokens;
    } catch (IOException e) {
        logger.error("", e);
    }
    return null;
}
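A sketch of a call; the exact stems are a guess at what GermanAnalyzer produces and depend on the Lucene version:

List<String> tokens = KeywordUtils.getTokens("Häuser und Gärten");
// GermanAnalyzer lowercases, removes the stopword "und" and stems the rest,
// yielding roughly ["haus", "gart"].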
From source file:net.sf.logsaw.index.internal.LuceneIndexServiceImpl.java
License:Open Source License
private void fillPhraseQuery(PhraseQuery phrase, Analyzer analyzer, String fld, String val) throws IOException {
    TokenStream ts = analyzer.tokenStream(fld, new StringReader(val));
    try {
        ts.reset();
        // Iterate over tokens and treat each token as term
        int pos = 0;
        while (ts.incrementToken()) {
            CharTermAttribute t = ts.getAttribute(CharTermAttribute.class);
            PositionIncrementAttribute p = ts.getAttribute(PositionIncrementAttribute.class);
            pos += p.getPositionIncrement();
            phrase.add(new Term(fld, t.toString()), pos - 1);
        }
        // End-of-stream clean-up
        ts.end();
    } finally {
        ts.close();
    }
}
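The pos - 1 in the add() call converts the running 1-based position counter into the 0-based positions PhraseQuery expects, while preserving any gaps left by removed stopwords. A hedged sketch of driving the helper; the field name, analyzer, and query text are assumptions:

PhraseQuery phrase = new PhraseQuery();
fillPhraseQuery(phrase, new StandardAnalyzer(Version.LUCENE_36), "message", "connection timed out");
// phrase now matches the analyzed terms at their original relative positions.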
From source file:net.sf.okapi.lib.tmdb.lucene.Seeker.java
License:Open Source License
public List<TmHit> searchFuzzy(String genericText, String codesAsString, String tmId, String locale, int max,
        int threshold, HashMap<String, String> attributes) {
    float searchThreshold = (float) threshold;
    if (threshold < 0)
        searchThreshold = 0.0f;
    if (threshold > 100)
        searchThreshold = 100.0f;

    String queryText = genericText;
    String gtextFName = TmEntry.GTEXT_PREFIX + locale;
    Locale javaLoc = new Locale(locale);

    // create basic ngram analyzer to tokenize query
    TokenStream queryTokenStream;
    if (javaLoc.getLanguage().equals(Locale.ENGLISH.getLanguage())) {
        queryTokenStream = defaultFuzzyAnalyzer.tokenStream(gtextFName, new StringReader(queryText));
    } else {
        queryTokenStream = new NgramAnalyzer(javaLoc, 4).tokenStream(gtextFName, new StringReader(queryText));
    }

    // Get the TermAttribute from the TokenStream
    CharTermAttribute termAtt = (CharTermAttribute) queryTokenStream.addAttribute(CharTermAttribute.class);
    TmFuzzyQuery fQuery = new TmFuzzyQuery(searchThreshold, gtextFName);
    try {
        queryTokenStream.reset();
        while (queryTokenStream.incrementToken()) {
            Term t = new Term(gtextFName, termAtt.toString());
            fQuery.add(t);
        }
        queryTokenStream.end();
        queryTokenStream.close();
    } catch (IOException e) {
        throw new OkapiIOException(e.getMessage(), e);
    }
    return getFuzzyHits(fQuery, genericText, codesAsString, tmId, locale, max, searchThreshold, attributes);
}
From source file:NewsIR_search.TRECQuery.java
/**
 * Analyzes the content of the 'queryField' from the query text and
 * returns the resulting terms, separated by single spaces.
 * @param analyzer
 * @param queryField
 * @return (String) The analyzed content of the field
 * @throws Exception
 */
public String queryFieldAnalyze(Analyzer analyzer, String queryField) throws Exception {
    StringBuffer buff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(CumulativeIndexer.FIELD_TEXT, new StringReader(queryField));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
From source file:nl.uva.sne.commons.SemanticUtils.java
public static List<String> tokenize(String text, boolean stem) throws IOException, JWNLException {
    text = text.replaceAll("’", "'");
    text = text.replaceAll("_", " ");
    text = text.replaceAll("[0-9]", "");
    text = text.replaceAll("[\\p{Punct}&&[^'-]]+", " ");
    text = text.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
    text = text.toLowerCase();
    TokenStream tokenStream;
    if (stem) {
        tokenStream = tokenStemStream("field", new StringReader(text));
    } else {
        tokenStream = tokenStream("field", new StringReader(text));
    }
    ArrayList<String> words = new ArrayList<>();
    try {
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(term.toString());
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
    }
    return words;
}
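Called from other code this might look as follows; the exact tokens depend on the analyzers behind tokenStream()/tokenStemStream(), which are defined elsewhere in the class:

// Illustrative inputs; the regex pre-processing already strips digits,
// most punctuation, and contractions such as 's before analysis.
List<String> raw = SemanticUtils.tokenize("The user's 2 networks", false);
List<String> stemmed = SemanticUtils.tokenize("The user's 2 networks", true);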
From source file:org.alfresco.repo.search.impl.lucene.analysis.MLAnalayserTest.java
License:Open Source License
/**
 * Check that the TokenStream yields exactly the expected tokens.
 * Note that order is not checked, since the map of locales will not provide a
 * predictable ordering when enumerated.
 *
 * The expected list of tokens may contain the same token more than once and
 * the number of instances will have to match the number found in the stream.
 *
 * @param ts TokenStream to inspect.
 * @param expectedTokens List of tokens expected from the stream, in any order.
 * @throws IOException
 */
private void verifyTokenStream(TokenStream ts, List<String> expectedTokens) throws IOException {
    final int expectedCount = expectedTokens.size();
    int count = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            System.out.println("Token: " + termAtt.toString());
            if (expectedTokens.contains(termAtt.toString())) {
                // remove an instance of the term text so that it is not matched again
                expectedTokens.remove(termAtt.toString());
            } else {
                fail("Unexpected token: " + termAtt.toString());
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    assertEquals("Incorrect number of tokens generated.", expectedCount, count);
}
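A sketch of driving this helper from a test; WhitespaceAnalyzer stands in for the analyzer actually under test, and the expected list must be mutable because the helper removes matched entries:

TokenStream ts = new WhitespaceAnalyzer(Version.LUCENE_36).tokenStream("content",
        new StringReader("alpha beta alpha"));
// Wrap in a new ArrayList, not a bare Arrays.asList: the helper calls remove() on the list.
verifyTokenStream(ts, new ArrayList<String>(Arrays.asList("alpha", "beta", "alpha")));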
From source file:org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilterTest.java
License:Open Source License
private void tokenise(TokenStream ts, String[] tokens) throws IOException {
    int i = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + ts.reflectAsString(true));
            String termText = termAtt.toString();
            if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE)
                    || typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME)) {
                assert (i % 2 == 1);
                assertEquals(termText, tokens[i++]);
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    if (i != tokens.length) {
        fail("Invalid number of tokens, found " + i + " and expected " + tokens.length);
    }
}
From source file:org.alfresco.solr.AlfrescoFieldType.java
License:Open Source License
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;
    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
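This closely resembles Solr's own TextField.analyzeMultiTerm. A hedged call sketch; KeywordAnalyzer is chosen here because it emits exactly one token, which is what the single-token check enforces:

// Illustrative: normalizing one endpoint of a range query on the "content" field.
BytesRef normalized = AlfrescoFieldType.analyzeMultiTerm("content", "Zebra", new KeywordAnalyzer());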