List of usage examples for org.apache.lucene.analysis.TokenStream#close()
@Override public void close() throws IOException
From source file:org.sindice.siren.qparser.keyword.query.processors.AnalyzerQueryNodeProcessor.java
License:Apache License
@Override protected QueryNode postProcessNode(final QueryNode node) throws QueryNodeException { if (node instanceof TextableQueryNode && !(node instanceof WildcardQueryNode) && !(node instanceof FuzzyQueryNode) && !(node instanceof ParametricQueryNode)) { final FieldQueryNode fieldNode = ((FieldQueryNode) node); final String text = fieldNode.getTextAsString(); final String field = fieldNode.getFieldAsString(); final TokenStream source = this.analyzer.tokenStream(field, new StringReader(text)); final CachingTokenFilter buffer = new CachingTokenFilter(source); PositionIncrementAttribute posIncrAtt = null; int numTokens = 0; int positionCount = 0; boolean severalTokensAtSamePosition = false; if (buffer.hasAttribute(PositionIncrementAttribute.class)) { posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class); }//from w w w.ja va 2 s. c om try { while (buffer.incrementToken()) { numTokens++; final int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1; if (positionIncrement != 0) { positionCount += positionIncrement; } else { severalTokensAtSamePosition = true; } } } catch (final IOException e) { // ignore } try { // rewind the buffer stream buffer.reset(); // close original stream - all tokens buffered source.close(); } catch (final IOException e) { // ignore } if (!buffer.hasAttribute(CharTermAttribute.class)) { return new NoTokenFoundQueryNode(); } final CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class); if (numTokens == 0) { return new NoTokenFoundQueryNode(); } else if (numTokens == 1) { String term = null; try { boolean hasNext; hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.toString(); } catch (final IOException e) { // safe to ignore, because we know the number of tokens } fieldNode.setText(term); return fieldNode; } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) { if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) { // no 
phrase query: final LinkedList<QueryNode> children = new LinkedList<QueryNode>(); for (int i = 0; i < numTokens; i++) { String term = null; try { final boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.toString(); } catch (final IOException e) { // safe to ignore, because we know the number of tokens } children.add(new FieldQueryNode(field, term, -1, -1)); } // If multiple terms at one single position, this must be a query // expansion. Perform a OR between the terms. if (severalTokensAtSamePosition && positionCount == 1) { return new GroupQueryNode(new OrQueryNode(children)); } else if (positionCount == 1) { return new GroupQueryNode(new StandardBooleanQueryNode(children, true)); } else { return new StandardBooleanQueryNode(children, false); } } else { // phrase query: final MultiPhraseQueryNode mpq = new MultiPhraseQueryNode(); final List<FieldQueryNode> multiTerms = new ArrayList<FieldQueryNode>(); int position = -1; int i = 0; int termGroupCount = 0; for (; i < numTokens; i++) { String term = null; int positionIncrement = 1; try { final boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.toString(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (final IOException e) { // safe to ignore, because we know the number of tokens } if (positionIncrement > 0 && multiTerms.size() > 0) { for (final FieldQueryNode termNode : multiTerms) { if (this.positionIncrementsEnabled) { termNode.setPositionIncrement(position); } else { termNode.setPositionIncrement(termGroupCount); } mpq.add(termNode); } // Only increment once for each "group" of // terms that were in the same position: termGroupCount++; multiTerms.clear(); } position += positionIncrement; multiTerms.add(new FieldQueryNode(field, term, -1, -1)); } for (final FieldQueryNode termNode : multiTerms) { if (this.positionIncrementsEnabled) { termNode.setPositionIncrement(position); } else { 
termNode.setPositionIncrement(termGroupCount); } mpq.add(termNode); } return mpq; } } else { final TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode(); int position = -1; for (int i = 0; i < numTokens; i++) { String term = null; int positionIncrement = 1; try { final boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.toString(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (final IOException e) { // safe to ignore, because we know the number of tokens } final FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1); if (this.positionIncrementsEnabled) { position += positionIncrement; newFieldNode.setPositionIncrement(position); } else { newFieldNode.setPositionIncrement(i); } pq.add(newFieldNode); } return pq; } } return node; }
From source file:org.sindice.siren.qparser.ntriple.query.processors.ResourceAnalyzerQueryNodeProcessor.java
License:Apache License
@Override protected QueryNode postProcessNode(final QueryNode node) throws QueryNodeException { if (node instanceof TextableQueryNode && !(node instanceof WildcardQueryNode) && !(node instanceof FuzzyQueryNode) && !(node instanceof ParametricQueryNode)) { final FieldQueryNode fieldNode = ((FieldQueryNode) node); final String text = fieldNode.getTextAsString(); final String field = fieldNode.getFieldAsString(); final TokenStream source = this.analyzer.tokenStream(field, new StringReader(text)); final CachingTokenFilter buffer = new CachingTokenFilter(source); PositionIncrementAttribute posIncrAtt = null; int numTokens = 0; int positionCount = 0; boolean severalTokensAtSamePosition = false; if (buffer.hasAttribute(PositionIncrementAttribute.class)) { posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class); }//from www . j a va2s .com try { while (buffer.incrementToken()) { numTokens++; final int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1; if (positionIncrement != 0) { positionCount += positionIncrement; } else { severalTokensAtSamePosition = true; } } } catch (final IOException e) { // ignore } try { // rewind the buffer stream buffer.reset(); // close original stream - all tokens buffered source.close(); } catch (final IOException e) { // ignore } if (!buffer.hasAttribute(CharTermAttribute.class)) { return new NoTokenFoundQueryNode(); } final CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class); if (numTokens == 0) { return new NoTokenFoundQueryNode(); } else if (numTokens == 1) { String term = null; try { boolean hasNext; hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.toString(); } catch (final IOException e) { // safe to ignore, because we know the number of tokens } fieldNode.setText(term); return fieldNode; } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) { if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) { // no 
phrase query: final LinkedList<QueryNode> children = new LinkedList<QueryNode>(); for (int i = 0; i < numTokens; i++) { String term = null; try { final boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.toString(); } catch (final IOException e) { // safe to ignore, because we know the number of tokens } children.add(new FieldQueryNode(field, term, -1, -1)); } // If multiple terms at one single position, this must be a query // expansion. Perform a OR between the terms. if (positionCount == 1) { final LinkedList<QueryNode> modified = new LinkedList<QueryNode>(); for (final QueryNode child : children) { modified.add(new ModifierQueryNode(child, Modifier.MOD_NONE)); } return new GroupQueryNode(new StandardBooleanQueryNode(modified, true)); } // Multiple terms over more than one position. // Not able to support such a case. Usually, it is the result of a bad // use of filters at query time. Better to throw an exception. else { throw new QueryNodeException(new MessageImpl("Multiple terms over more than one position")); } } else { // phrase query: final MultiPhraseQueryNode mpq = new MultiPhraseQueryNode(); final List<FieldQueryNode> multiTerms = new ArrayList<FieldQueryNode>(); int position = -1; int i = 0; int termGroupCount = 0; for (; i < numTokens; i++) { String term = null; int positionIncrement = 1; try { final boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.toString(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (final IOException e) { // safe to ignore, because we know the number of tokens } if (positionIncrement > 0 && multiTerms.size() > 0) { for (final FieldQueryNode termNode : multiTerms) { if (this.positionIncrementsEnabled) { termNode.setPositionIncrement(position); } else { termNode.setPositionIncrement(termGroupCount); } mpq.add(termNode); } // Only increment once for each "group" of // terms that were in the same position: termGroupCount++; 
multiTerms.clear(); } position += positionIncrement; multiTerms.add(new FieldQueryNode(field, term, -1, -1)); } for (final FieldQueryNode termNode : multiTerms) { if (this.positionIncrementsEnabled) { termNode.setPositionIncrement(position); } else { termNode.setPositionIncrement(termGroupCount); } mpq.add(termNode); } return mpq; } } else { final TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode(); int position = -1; for (int i = 0; i < numTokens; i++) { String term = null; int positionIncrement = 1; try { final boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.toString(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (final IOException e) { // safe to ignore, because we know the number of tokens } final FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1); if (this.positionIncrementsEnabled) { position += positionIncrement; newFieldNode.setPositionIncrement(position); } else { newFieldNode.setPositionIncrement(i); } pq.add(newFieldNode); } return pq; } } return node; }
From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java
License:Apache License
/**
 * Analyzes {@code s} as a value of {@code fieldName} and adds the text of
 * every emitted token to {@code set}.
 *
 * @param fieldName field name handed to the analyzer
 * @param s text to analyze
 * @param set receives one entry per emitted token
 * @throws IOException if the token stream fails while being read
 */
private void processFieldEntry(String fieldName, String s, CharArraySet set) throws IOException {
    // try-with-resources closes the stream even when analysis throws.
    try (TokenStream ts = analyzer.tokenStream(fieldName, s)) {
        CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            set.add(cattr.toString());
        }
        ts.end();
    }
}
From source file:org.tallison.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader.java
License:Apache License
private int addFieldValue(String fieldName, int currInd, int charBase, String fieldValue, TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results) throws IOException { //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true); TokenStream stream = baseAnalyzer.tokenStream(fieldName, fieldValue); stream.reset();/*from www . j a va 2 s. c o m*/ int defaultInc = 1; CharTermAttribute termAtt = stream .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class); OffsetAttribute offsetAtt = stream .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class); PositionIncrementAttribute incAtt = null; if (stream.hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) { incAtt = stream .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class); } while (stream.incrementToken()) { //Do we need this? if (incAtt != null && incAtt.getPositionIncrement() == 0) { continue; } currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc; if (requests.contains(currInd)) { results.add(currInd, offsetAtt.startOffset() + charBase, offsetAtt.endOffset() + charBase, termAtt.toString()); } if (currInd > requests.getLast()) { // TODO: Is there a way to avoid this? Or, is this // an imaginary performance hit? while (stream.incrementToken()) { //NO-OP } stream.end(); stream.close(); return GOT_ALL_REQUESTS; } } stream.end(); stream.close(); return currInd; }
From source file:org.tallison.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil.java
License:Apache License
/** * allows reuse of terms, this method calls terms.clear() before adding new * terms/*from w ww . ja v a 2 s .com*/ * * @param s string to analyze * @param field to use in analysis * @param analyzer analyzer * @param terms list for reuse * @return list of strings * @throws java.io.IOException if there's an IOException during analysis */ public static List<String> getTermStrings(String s, String field, Analyzer analyzer, List<String> terms) throws IOException { if (terms == null) { terms = new ArrayList<>(); } terms.clear(); TokenStream stream = analyzer.tokenStream(field, s); stream.reset(); CharTermAttribute termAtt = stream .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class); while (stream.incrementToken()) { terms.add(termAtt.toString()); } stream.end(); stream.close(); return terms; }
From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java
License:Apache License
/**
 * Checks that the bigram analyzer built without unigrams emits only the
 * bigrams of the input, each with a position increment of 1.
 */
@Test
public void testBasicNoUnigrams() throws Exception {
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, false);
    String s = "a b c d e f g";

    List<String> expected = Arrays.asList("a_b", "b_c", "c_d", "d_e", "e_f", "f_g");
    List<String> returned = new ArrayList<>();

    // try-with-resources closes the stream even when an assertion in the loop fails.
    try (TokenStream tokenStream = analyzer.tokenStream(ConcordanceTestBase.FIELD, s)) {
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

        while (tokenStream.incrementToken()) {
            String token = charTermAttribute.toString();
            assertEquals(1, posIncAttribute.getPositionIncrement());
            returned.add(token);
        }
        tokenStream.end();
    }

    assertEquals(expected, returned);
}
From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java
License:Apache License
/**
 * Checks that the bigram analyzer built with unigrams emits unigrams and
 * bigrams alternately: tokens at even indices carry a position increment of
 * 1, tokens at odd indices a position increment of 0.
 */
@Test
public void testIncludeUnigrams() throws Exception {
    List<String> expected = Arrays.asList(
            "a", "a_b", "b", "b_c", "c", "c_d", "d", "d_e", "e", "e_f", "f", "f_g", "g");
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, true);
    String s = "a b c d e f g";

    List<String> returned = new ArrayList<>();

    // try-with-resources closes the stream even when an assertion in the loop fails.
    try (TokenStream tokenStream = analyzer.tokenStream("f", s)) {
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

        int i = 0;
        while (tokenStream.incrementToken()) {
            String token = charTermAttribute.toString();
            if (i++ % 2 == 0) {
                assertEquals(1, posIncAttribute.getPositionIncrement());
            } else {
                assertEquals(0, posIncAttribute.getPositionIncrement());
            }
            returned.add(token);
        }
        tokenStream.end();
    }

    assertEquals(expected, returned);
}
From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java
License:Apache License
@Test public void testCJKNoUnigrams() throws Exception { final CharacterRunAutomaton stops = MockTokenFilter.EMPTY_STOPSET; int posIncGap = 10; final int charOffsetGap = 10; Analyzer analyzer = getCJKBigramAnalyzer(false); TokenStream ts = analyzer.tokenStream(FIELD, ""); ts.reset();/*from w ww . j a va2 s. c o m*/ CharTermAttribute charTermAttribute = ts.getAttribute(CharTermAttribute.class); PositionIncrementAttribute positionIncrementAttribute = ts.getAttribute(PositionIncrementAttribute.class); ts.end(); ts.close(); String[] docs = new String[] { "" }; Directory directory = getDirectory(analyzer, docs); IndexReader reader = DirectoryReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(reader); ConcordanceSearcher searcher = new ConcordanceSearcher( new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD))); Query q = new TermQuery(new Term(FIELD, "")); //now test straight and span wrapper ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10); searcher.search(indexSearcher, FIELD, q, q, analyzer, collector); for (ConcordanceWindow w : collector.getWindows()) { //System.out.println(w); } reader.close(); directory.close(); }
From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerCase.java
License:Apache License
public static ArrayList<String> getTopicWord(String str) { // IK?smart?? Analyzer analyzer = new IKAnalyzer(true); ArrayList<String> retData = new ArrayList<String>(); // ?LuceneTokenStream TokenStream ts = null; try {/* w w w. j a v a 2 s . c o m*/ ts = analyzer.tokenStream("myfield", new StringReader(str)); // ??? OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); // ?? CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); // ?? TypeAttribute type = ts.addAttribute(TypeAttribute.class); // ?TokenStream?StringReader ts.reset(); // ?? while (ts.incrementToken()) { System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); if (term.toString().length() > 1 || term.toString().matches("^[0-9]*$")) { retData.add(term.toString()); } } // TokenStreamStringReader ts.end(); // Perform end-of-stream operations, e.g. set the final // offset. } catch (IOException e) { e.printStackTrace(); } finally { // TokenStream? if (ts != null) { try { ts.close(); } catch (IOException e) { e.printStackTrace(); } } } return retData; }
From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) { // IK?smart?? Analyzer analyzer = new IKAnalyzer(true); // ?LuceneTokenStream TokenStream ts = null; try {/* www .j a v a2 s. c o m*/ ts = analyzer.tokenStream("myfield", new StringReader("???")); // ??? OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); // ?? CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); // ?? TypeAttribute type = ts.addAttribute(TypeAttribute.class); // ?TokenStream?StringReader ts.reset(); // ?? while (ts.incrementToken()) { System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); } // TokenStreamStringReader ts.end(); // Perform end-of-stream operations, e.g. set the final // offset. } catch (IOException e) { e.printStackTrace(); } finally { // TokenStream? if (ts != null) { try { ts.close(); } catch (IOException e) { e.printStackTrace(); } } } }