List of usage examples for org.apache.lucene.analysis.TokenStream#reset()
public void reset() throws IOException
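All of the examples below follow the same contract: reset() must be called after obtaining a stream from an analyzer and before the first call to incrementToken(); once iteration finishes, end() records the final state and close() releases the stream. A minimal sketch of that workflow, assuming Lucene 5+'s no-argument StandardAnalyzer constructor (older releases take a Version argument); the field name "body" and the sample text are placeholders:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetContractSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello token stream reset"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                     // records end state, e.g. the final offset
        } finally {
            ts.close();                   // releases resources held by the stream
        }
    }
}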
From source file:org.allenai.blacklab.queryParser.lucene.QueryParserBase.java
License:Apache License
/**
 * @exception org.apache.lucene.queryparser.classic.ParseException thrown in overridden method to disallow
 */
protected TextPattern newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted)
        throws ParseException {
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count
    TokenStream source;
    try {
        source = analyzer.tokenStream(field, new StringReader(queryText));
        source.reset();
    } catch (IOException e) {
        ParseException p = new ParseException("Unable to initialize TokenStream to analyze query text");
        p.initCause(e);
        throw p;
    }
    CachingTokenFilter buffer = new CachingTokenFilter(source);
    TermToBytesRefAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;

    buffer.reset();

    if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
        termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
    }
    if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
    }

    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    boolean hasMoreTokens = false;
    if (termAtt != null) {
        try {
            hasMoreTokens = buffer.incrementToken();
            while (hasMoreTokens) {
                numTokens++;
                int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
                if (positionIncrement != 0) {
                    positionCount += positionIncrement;
                } else {
                    severalTokensAtSamePosition = true;
                }
                hasMoreTokens = buffer.incrementToken();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    try {
        // rewind the buffer stream
        buffer.reset();

        // close original stream - all tokens buffered
        source.close();
    } catch (IOException e) {
        ParseException p = new ParseException("Cannot close TokenStream analyzing query text");
        p.initCause(e);
        throw p;
    }

    BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();

    if (numTokens == 0)
        return null;
    else if (numTokens == 1) {
        try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            termAtt.fillBytesRef();
        } catch (IOException e) {
            // safe to ignore, because we know the number of tokens
        }
        return newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
    } else {
        if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries)) {
            if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) {
                // no phrase query:
                TextPatternBoolean q = newBooleanQuery(positionCount == 1); // BL: BooleanQuery -> TextPatternBoolean

                BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR
                        ? BooleanClause.Occur.MUST
                        : BooleanClause.Occur.SHOULD;

                for (int i = 0; i < numTokens; i++) {
                    try {
                        boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        termAtt.fillBytesRef();
                    } catch (IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }
                    TextPattern currentQuery = newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
                    q.add(currentQuery, occur);
                }
                return q;
            } else {
                // phrase query:
                TPMultiPhrase mpq = newMultiPhraseQuery(); // BL: MultiPhraseQuery -> TPMultiPhrase
                mpq.setSlop(phraseSlop);
                List<Term> multiTerms = new ArrayList<Term>();
                int position = -1;
                for (int i = 0; i < numTokens; i++) {
                    int positionIncrement = 1;
                    try {
                        boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        termAtt.fillBytesRef();
                        if (posIncrAtt != null) {
                            positionIncrement = posIncrAtt.getPositionIncrement();
                        }
                    } catch (IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }

                    if (positionIncrement > 0 && multiTerms.size() > 0) {
                        if (enablePositionIncrements) {
                            mpq.add(multiTerms.toArray(new Term[0]), position);
                        } else {
                            mpq.add(multiTerms.toArray(new Term[0]));
                        }
                        multiTerms.clear();
                    }
                    position += positionIncrement;
                    multiTerms.add(new Term(field, BytesRef.deepCopyOf(bytes)));
                }
                if (enablePositionIncrements) {
                    mpq.add(multiTerms.toArray(new Term[0]), position);
                } else {
                    mpq.add(multiTerms.toArray(new Term[0]));
                }
                return mpq;
            }
        } else {
            TPPhrase pq = newPhraseQuery(); // BL: PhraseQuery -> TPPhrase
            pq.setSlop(phraseSlop);
            int position = -1;

            for (int i = 0; i < numTokens; i++) {
                int positionIncrement = 1;

                try {
                    boolean hasNext = buffer.incrementToken();
                    assert hasNext == true;
                    termAtt.fillBytesRef();
                    if (posIncrAtt != null) {
                        positionIncrement = posIncrAtt.getPositionIncrement();
                    }
                } catch (IOException e) {
                    // safe to ignore, because we know the number of tokens
                }

                if (enablePositionIncrements) {
                    position += positionIncrement;
                    pq.add(new Term(field, BytesRef.deepCopyOf(bytes)), position);
                } else {
                    pq.add(new Term(field, BytesRef.deepCopyOf(bytes)));
                }
            }
            return pq;
        }
    }
}
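The method above leans on CachingTokenFilter, whose reset() means something different from TokenStream.reset(): once the cache has been filled, reset() rewinds to the first cached token so the buffered stream can be replayed. A stripped-down sketch of just that two-pass idiom, assuming the Lucene 4.x semantics used in the example (the raw stream is reset before wrapping, and reset() on the filter is a no-op until the cache exists); the method name twoPass and its arguments are placeholders:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TwoPassSketch {
    static void twoPass(Analyzer analyzer, String field, String queryText) throws IOException {
        TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
        source.reset();                                   // reset the raw stream before wrapping
        CachingTokenFilter buffer = new CachingTokenFilter(source);
        CharTermAttribute termAtt = buffer.addAttribute(CharTermAttribute.class);

        int numTokens = 0;
        while (buffer.incrementToken()) {                 // first pass fills the cache
            numTokens++;
        }

        buffer.reset();                                   // rewinds to the first cached token
        for (int i = 0; i < numTokens; i++) {
            buffer.incrementToken();                      // replays from the cache
            System.out.println(termAtt.toString());
        }
        source.close();                                   // safe: all tokens are buffered
    }
}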
From source file:org.allenai.blacklab.queryParser.lucene.QueryParserBase.java
License:Apache License
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    TokenStream source;

    if (analyzerIn == null)
        analyzerIn = analyzer;

    try {
        source = analyzerIn.tokenStream(field, new StringReader(part));
        source.reset();
    } catch (IOException e) {
        throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
    }

    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
    BytesRef bytes = termAtt.getBytesRef();

    try {
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
    } catch (IOException e) {
        throw new RuntimeException("error analyzing range part: " + part, e);
    }

    try {
        source.end();
        source.close();
    } catch (IOException e) {
        throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
    }

    return BytesRef.deepCopyOf(bytes);
}
From source file:org.apache.fuzzydb.queryParser.QueryParser.java
License:Open Source License
/**
 * @exception ParseException thrown in overridden method to disallow
 */
protected Query getFieldQuery(String field, String queryText) throws ParseException {
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count
    TokenStream source;
    try {
        source = analyzer.reusableTokenStream(field, new StringReader(queryText));
        source.reset();
    } catch (IOException e) {
        source = analyzer.tokenStream(field, new StringReader(queryText));
    }
    CachingTokenFilter buffer = new CachingTokenFilter(source);
    TermAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;

    boolean success = false;
    try {
        buffer.reset();
        success = true;
    } catch (IOException e) {
        // success==false if we hit an exception
    }
    if (success) {
        if (buffer.hasAttribute(TermAttribute.class)) {
            termAtt = buffer.getAttribute(TermAttribute.class);
        }
        if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
            posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
        }
    }

    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    boolean hasMoreTokens = false;
    if (termAtt != null) {
        try {
            hasMoreTokens = buffer.incrementToken();
            while (hasMoreTokens) {
                numTokens++;
                int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
                if (positionIncrement != 0) {
                    positionCount += positionIncrement;
                } else {
                    severalTokensAtSamePosition = true;
                }
                hasMoreTokens = buffer.incrementToken();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    try {
        // rewind the buffer stream
        buffer.reset();

        // close original stream - all tokens buffered
        source.close();
    } catch (IOException e) {
        // ignore
    }

    if (numTokens == 0)
        return null;
    else if (numTokens == 1) {
        String term = null;
        try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.term();
        } catch (IOException e) {
            // safe to ignore, because we know the number of tokens
        }
        return newTermQuery(new Term(field, term));
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                BooleanQuery q = newBooleanQuery(true);
                for (int i = 0; i < numTokens; i++) {
                    String term = null;
                    try {
                        boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        term = termAtt.term();
                    } catch (IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }

                    Query currentQuery = newTermQuery(new Term(field, term));
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q;
            } else {
                // phrase query:
                MultiPhraseQuery mpq = newMultiPhraseQuery();
                mpq.setSlop(phraseSlop);
                List<Term> multiTerms = new ArrayList<Term>();
                int position = -1;
                for (int i = 0; i < numTokens; i++) {
                    String term = null;
                    int positionIncrement = 1;
                    try {
                        boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        term = termAtt.term();
                        if (posIncrAtt != null) {
                            positionIncrement = posIncrAtt.getPositionIncrement();
                        }
                    } catch (IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }

                    if (positionIncrement > 0 && multiTerms.size() > 0) {
                        if (enablePositionIncrements) {
                            mpq.add(multiTerms.toArray(new Term[0]), position);
                        } else {
                            mpq.add(multiTerms.toArray(new Term[0]));
                        }
                        multiTerms.clear();
                    }
                    position += positionIncrement;
                    multiTerms.add(new Term(field, term));
                }
                if (enablePositionIncrements) {
                    mpq.add(multiTerms.toArray(new Term[0]), position);
                } else {
                    mpq.add(multiTerms.toArray(new Term[0]));
                }
                return mpq;
            }
        } else {
            PhraseQuery pq = newPhraseQuery();
            pq.setSlop(phraseSlop);
            int position = -1;

            for (int i = 0; i < numTokens; i++) {
                String term = null;
                int positionIncrement = 1;

                try {
                    boolean hasNext = buffer.incrementToken();
                    assert hasNext == true;
                    term = termAtt.term();
                    if (posIncrAtt != null) {
                        positionIncrement = posIncrAtt.getPositionIncrement();
                    }
                } catch (IOException e) {
                    // safe to ignore, because we know the number of tokens
                }

                if (enablePositionIncrements) {
                    position += positionIncrement;
                    pq.add(new Term(field, term), position);
                } else {
                    pq.add(new Term(field, term));
                }
            }
            return pq;
        }
    }
}
From source file:org.apache.jackrabbit.core.query.lucene.AbstractIndex.java
License:Apache License
/**
 * Returns a document that is finished with text extraction and is ready to
 * be added to the index.
 *
 * @param doc the document to check.
 * @return <code>doc</code> if it is finished already or a stripped down
 *         copy of <code>doc</code> without text extractors.
 * @throws IOException if the document cannot be added to the indexing
 *                     queue.
 */
private Document getFinishedDocument(Document doc) throws IOException {
    if (!Util.isDocumentReady(doc)) {
        Document copy = new Document();
        // mark the document that reindexing is required
        copy.add(new Field(FieldNames.REINDEXING_REQUIRED, false, "", Field.Store.NO,
                Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
        for (Fieldable f : doc.getFields()) {
            Fieldable field = null;
            Field.TermVector tv = getTermVectorParameter(f);
            Field.Store stored = f.isStored() ? Field.Store.YES : Field.Store.NO;
            Field.Index indexed = getIndexParameter(f);
            if (f instanceof LazyTextExtractorField || f.readerValue() != null) {
                // replace all readers with empty string reader
                field = new Field(f.name(), new StringReader(""), tv);
            } else if (f.stringValue() != null) {
                field = new Field(f.name(), false, f.stringValue(), stored, indexed, tv);
            } else if (f.isBinary()) {
                field = new Field(f.name(), f.getBinaryValue(), stored);
            } else if (f.tokenStreamValue() != null && f.tokenStreamValue() instanceof SingletonTokenStream) {
                TokenStream tokenStream = f.tokenStreamValue();
                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
                tokenStream.incrementToken();
                String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
                tokenStream.reset();
                field = new Field(f.name(),
                        new SingletonTokenStream(value, (Payload) payloadAttribute.getPayload().clone()));
            }
            if (field != null) {
                field.setOmitNorms(f.getOmitNorms());
                copy.add(field);
            }
        }
        // schedule the original document for later indexing
        Document existing = indexingQueue.addDocument(doc);
        if (existing != null) {
            // the queue already contained a pending document for this
            // node. -> dispose the document
            Util.disposeDocument(existing);
        }
        // use the stripped down copy for now
        doc = copy;
    }
    return doc;
}
From source file:org.apache.jackrabbit.core.query.lucene.JackrabbitQueryParser.java
License:Apache License
/**
 * {@inheritDoc}
 */
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
    // only create a prefix query when the term is a single word / token
    Analyzer a = getAnalyzer();
    TokenStream ts = a.tokenStream(field, new StringReader(termStr));
    int count = 0;
    boolean isCJ = false;
    try {
        TypeAttribute t = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            isCJ = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.CJ].equals(t.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new ParseException(e.getMessage());
    } finally {
        try {
            ts.close();
        } catch (IOException e) {
            // ignore
        }
    }
    if (count > 1 && isCJ) {
        return getFieldQuery(field, termStr);
    } else {
        return getWildcardQuery(field, termStr + "*");
    }
}
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex.java
License:Apache License
/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?').
 *
 * @param text     the text to tokenize
 * @param analyzer the analyzer used to tokenize the text
 * @return the list of tokens, or <code>null</code> if tokenization failed
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        stream.reset();

        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                // a gap between tokens: keep any wildcard chars found there
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        // pass the exception itself so the stack trace is logged,
        // not just its message
        LOG.error("Building fulltext query failed", e);
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return tokens;
}
From source file:org.apache.mahout.classifier.NewsgroupHelper.java
License:Apache License
public static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
    ts.end();
    Closeables.close(ts, true);
}
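On recent Lucene versions the same loop is usually written with try-with-resources, since TokenStream implements Closeable, which guarantees close() even when reset() or incrementToken() throws. A minimal sketch of that variant (the class name CountWordsSketch is a placeholder; the field name "text" follows the example above):

import java.io.IOException;
import java.io.Reader;
import java.util.Collection;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CountWordsSketch {
    static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
        try (TokenStream ts = analyzer.tokenStream("text", in)) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                words.add(termAtt.toString());
            }
            ts.end();
        } // close() happens automatically, even on exception
    }
}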
From source file:org.apache.mahout.classifier.sgd.NewsgroupHelper.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.reusableTokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
}
From source file:org.apache.mahout.text.MailArchivesClusteringAnalyzerTest.java
License:Apache License
@Test
public void testAnalysis() throws Exception {
    Analyzer analyzer = new MailArchivesClusteringAnalyzer();

    String text = "A test message\n" + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
            + "Mahout is a scalable, machine-learning LIBRARY\n"
            + "we've added some additional stopwords such as html, mailto, regards\t"
            + "apache_hadoop provides the foundation for scalability\n"
            + "www.nabble.com general-help@incubator.apache.org\n"
            + "public void int protected package";
    Reader reader = new StringReader(text);

    // if you change the text above, then you may need to change this as well
    // order matters too
    String[] expectedTokens = { "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
            "stopword", "apache_hadoop", "provid", "foundat", "scalabl" };

    TokenStream tokenStream = analyzer.tokenStream("test", reader);
    assertNotNull(tokenStream);
    tokenStream.reset();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    int e = 0;
    while (tokenStream.incrementToken() && e < expectedTokens.length) {
        assertEquals(expectedTokens[e++], termAtt.toString());
    }
    // expected value comes first in JUnit's assertEquals(expected, actual)
    assertEquals(expectedTokens.length, e);
    tokenStream.end();
    tokenStream.close();
}
From source file:org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String document = value.toString();
    document = StringEscapeUtils.unescapeHtml4(CLOSE_TEXT_TAG_PATTERN
            .matcher(OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
    String catMatch = findMatchingCategory(document);
    if (!"Unknown".equals(catMatch)) {
        StringBuilder contents = new StringBuilder(1000);
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
        }
        context.write(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
        stream.end();
        Closeables.close(stream, true);
    }
}