Usage examples for org.apache.lucene.analysis.TokenStream.reset()
public void reset() throws IOException
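Before the per-project examples below, here is a minimal, self-contained sketch of the reset() contract: a consumer obtains a TokenStream, registers the attributes it needs, calls reset() exactly once before the first incrementToken(), then calls end() and close() when iteration finishes. The analyzer, field name, and sample text are illustrative assumptions (not taken from the examples below), and the no-argument StandardAnalyzer constructor assumes a reasonably recent Lucene version.

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetContractExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical analyzer/field/text chosen only for illustration.
        try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("body", "Hello TokenStream reset contract");
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // advance to the next token
                System.out.println(termAttr.toString());
            }
            stream.end();                     // finalize end-of-stream state (offsets etc.)
            stream.close();                   // release resources
        }
    }
}

Forgetting reset() (or calling it twice) is the most common cause of IllegalStateException from MockTokenizer-based tests like the ones below.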
From source file: org.elasticsearch.search.suggest.CompletionTokenStreamTest.java
License: Apache License

@Test(expected = IllegalArgumentException.class)
public void testInValidNumberOfExpansions() throws IOException {
    Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
        builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0; i < 9; i++) { // 9 -> expands to 512
        valueBuilder.append(i + 1);
        valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(new StringReader(valueBuilder.toString()),
            MockTokenizer.WHITESPACE, true);
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
    TokenStream suggestTokenStream = new CompletionTokenStream(filter,
            new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    Set<IntsRef> finiteStrings = suggester
                            .toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream);
                    return finiteStrings;
                }
            });
    suggestTokenStream.reset();
    suggestTokenStream.incrementToken();
    suggestTokenStream.close();
}
From source file: org.elasticsearch.search.suggest.CompletionTokenStreamTest.java
License: Apache License

@Test
public void testSuggestTokenFilterProperlyDelegateInputStream() throws Exception {
    TokenStream tokenStream = new MockTokenizer(new StringReader("mykeyword"), MockTokenizer.WHITESPACE, true);
    BytesRef payload = new BytesRef("Surface keyword|friggin payload|10");
    TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(
            new CompletionTokenStream(tokenStream, payload, new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    return suggester.toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream);
                }
            }));
    TermToBytesRefAttribute termAtt = suggestTokenStream.getAttribute(TermToBytesRefAttribute.class);
    BytesRef ref = termAtt.getBytesRef();
    assertNotNull(ref);
    suggestTokenStream.reset();
    while (suggestTokenStream.incrementToken()) {
        termAtt.fillBytesRef();
        assertThat(ref.utf8ToString(), equalTo("mykeyword"));
    }
    suggestTokenStream.end();
    suggestTokenStream.close();
}
From source file: org.elasticsearch.search.suggest.CompletionTokenStreamTests.java
License: Apache License

@Test
public void testValidNumberOfExpansions() throws IOException {
    Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
        builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0; i < 8; i++) {
        valueBuilder.append(i + 1);
        valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader(valueBuilder.toString()));
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
    TokenStream suggestTokenStream = new CompletionTokenStream(filter,
            new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    Set<IntsRef> finiteStrings = suggester.toFiniteStrings(stream);
                    return finiteStrings;
                }
            });
    suggestTokenStream.reset();
    ByteTermAttribute attr = suggestTokenStream.addAttribute(ByteTermAttribute.class);
    PositionIncrementAttribute posAttr = suggestTokenStream.addAttribute(PositionIncrementAttribute.class);
    int maxPos = 0;
    int count = 0;
    while (suggestTokenStream.incrementToken()) {
        count++;
        assertNotNull(attr.getBytesRef());
        assertTrue(attr.getBytesRef().length > 0);
        maxPos += posAttr.getPositionIncrement();
    }
    suggestTokenStream.close();
    assertEquals(count, 256);
    assertEquals(count, maxPos);
}
From source file: org.elasticsearch.search.suggest.CompletionTokenStreamTests.java
License: Apache License

@Test(expected = IllegalArgumentException.class)
public void testInValidNumberOfExpansions() throws IOException {
    Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
        builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0; i < 9; i++) { // 9 -> expands to 512
        valueBuilder.append(i + 1);
        valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader(valueBuilder.toString()));
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
    TokenStream suggestTokenStream = new CompletionTokenStream(filter,
            new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    Set<IntsRef> finiteStrings = suggester.toFiniteStrings(stream);
                    return finiteStrings;
                }
            });
    suggestTokenStream.reset();
    suggestTokenStream.incrementToken();
    suggestTokenStream.close();
}
From source file: org.elasticsearch.search.suggest.CompletionTokenStreamTests.java
License: Apache License

@Test
public void testSuggestTokenFilterProperlyDelegateInputStream() throws Exception {
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader("mykeyword"));
    BytesRef payload = new BytesRef("Surface keyword|friggin payload|10");
    TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(
            new CompletionTokenStream(tokenizer, payload, new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    return suggester.toFiniteStrings(stream);
                }
            }));
    TermToBytesRefAttribute termAtt = suggestTokenStream.getAttribute(TermToBytesRefAttribute.class);
    assertNotNull(termAtt.getBytesRef());
    suggestTokenStream.reset();
    while (suggestTokenStream.incrementToken()) {
        assertThat(termAtt.getBytesRef().utf8ToString(), equalTo("mykeyword"));
    }
    suggestTokenStream.end();
    suggestTokenStream.close();
}
From source file: org.elasticsearch.search.suggest.SuggestUtils.java
License: Apache License

// Drives a TokenStream through the standard reset/incrementToken/close cycle,
// forwarding each token to the consumer and returning the token count.
public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
    stream.reset();
    consumer.reset(stream);
    int numTokens = 0;
    while (stream.incrementToken()) {
        consumer.nextToken();
        numTokens++;
    }
    consumer.end();
    stream.close();
    return numTokens;
}
From source file: org.elasticsearch.test.unit.index.analysis.AnalysisTestsHelper.java
License: Apache License

public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    Assert.assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        Assert.assertTrue(i < expected.length, "got extra term: " + termAttr.toString());
        Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
        i++;
    }
    Assert.assertEquals(i, expected.length, "not all tokens produced");
}
From source file: org.elasticsearch.test.unit.index.analysis.AnalysisTestsHelper.java
License: Apache License

public static void assertSimpleTSOutput(TokenStream stream, String[] expected, int[] posInc) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttr = stream.getAttribute(PositionIncrementAttribute.class);
    Assert.assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        Assert.assertTrue(i < expected.length, "got extra term: " + termAttr.toString());
        Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
        Assert.assertEquals(posIncAttr.getPositionIncrement(), posInc[i]);
        i++;
    }
    Assert.assertEquals(i, expected.length, "not all tokens produced");
}
From source file: org.exist.indexing.lucene.LuceneMatchListener.java
License: Open Source License

private void scanMatches(NodeProxy p) {
    // Collect the text content of all descendants of p.
    // Remember the start offsets of the text nodes for later use.
    NodePath path = getPath(p);
    LuceneIndexConfig idxConf = config.getConfig(path).next();
    TextExtractor extractor = new DefaultTextExtractor();
    extractor.configure(config, idxConf);
    OffsetList offsets = new OffsetList();
    int level = 0;
    int textOffset = 0;
    try {
        EmbeddedXMLStreamReader reader = broker.getXMLStreamReader(p, false);
        while (reader.hasNext()) {
            int ev = reader.next();
            switch (ev) {
            case XMLStreamConstants.END_ELEMENT:
                if (--level < 0) {
                    break;
                }
                textOffset += extractor.endElement(reader.getQName());
                break;
            case XMLStreamConstants.START_ELEMENT:
                ++level;
                textOffset += extractor.startElement(reader.getQName());
                break;
            case XMLStreamConstants.CHARACTERS:
                NodeId nodeId = (NodeId) reader.getProperty(ExtendedXMLStreamReader.PROPERTY_NODE_ID);
                textOffset += extractor.beforeCharacters();
                offsets.add(textOffset, nodeId);
                textOffset += extractor.characters(reader.getXMLText());
                break;
            }
        }
    } catch (IOException | XMLStreamException e) {
        LOG.warn("Problem found while serializing XML: " + e.getMessage(), e);
    }
    // Retrieve the Analyzer for the NodeProxy that was used for indexing and querying.
    Analyzer analyzer = idxConf.getAnalyzer();
    if (analyzer == null) {
        // Otherwise use the system default Lucene analyzer (from conf.xml)
        // to tokenize the text and find matching query terms.
        analyzer = index.getDefaultAnalyzer();
    }
    LOG.debug("Analyzer: " + analyzer + " for path: " + path);
    String str = extractor.getText().toString();
    try {
        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(str));
        tokenStream.reset();
        MarkableTokenFilter stream = new MarkableTokenFilter(tokenStream);
        while (stream.incrementToken()) {
            String text = stream.getAttribute(CharTermAttribute.class).toString();
            Query query = termMap.get(text);
            if (query != null) {
                // Phrase queries need to be handled differently to filter out wrong matches:
                // only the phrase should be marked, not single words which may also occur
                // elsewhere in the document.
                if (query instanceof PhraseQuery) {
                    PhraseQuery phraseQuery = (PhraseQuery) query;
                    Term[] terms = phraseQuery.getTerms();
                    if (text.equals(terms[0].text())) {
                        // Scan the following text and collect tokens to see
                        // if they are part of the phrase.
                        stream.mark();
                        int t = 1;
                        List<State> stateList = new ArrayList<>(terms.length);
                        stateList.add(stream.captureState());
                        while (stream.incrementToken() && t < terms.length) {
                            text = stream.getAttribute(CharTermAttribute.class).toString();
                            if (text.equals(terms[t].text())) {
                                stateList.add(stream.captureState());
                                if (++t == terms.length) {
                                    break;
                                }
                            } else {
                                // Don't reset the token stream since we will miss matches. /ljo
                                //stream.reset();
                                break;
                            }
                        }
                        if (stateList.size() == terms.length) {
                            // We indeed have a phrase match: record the offsets of its terms.
                            int lastIdx = -1;
                            for (int i = 0; i < terms.length; i++) {
                                stream.restoreState(stateList.get(i));
                                OffsetAttribute offsetAttr = stream.getAttribute(OffsetAttribute.class);
                                int idx = offsets.getIndex(offsetAttr.startOffset());
                                NodeId nodeId = offsets.ids[idx];
                                Offset offset = nodesWithMatch.get(nodeId);
                                if (offset != null)
                                    if (lastIdx == idx)
                                        offset.setEndOffset(offsetAttr.endOffset() - offsets.offsets[idx]);
                                    else
                                        offset.add(offsetAttr.startOffset() - offsets.offsets[idx],
                                                offsetAttr.endOffset() - offsets.offsets[idx]);
                                else
                                    nodesWithMatch.put(nodeId,
                                            new Offset(offsetAttr.startOffset() - offsets.offsets[idx],
                                                    offsetAttr.endOffset() - offsets.offsets[idx]));
                                lastIdx = idx;
                            }
                        }
                    }
                    // End of phrase handling
                } else {
                    OffsetAttribute offsetAttr = stream.getAttribute(OffsetAttribute.class);
                    int idx = offsets.getIndex(offsetAttr.startOffset());
                    NodeId nodeId = offsets.ids[idx];
                    Offset offset = nodesWithMatch.get(nodeId);
                    if (offset != null)
                        offset.add(offsetAttr.startOffset() - offsets.offsets[idx],
                                offsetAttr.endOffset() - offsets.offsets[idx]);
                    else {
                        nodesWithMatch.put(nodeId, new Offset(offsetAttr.startOffset() - offsets.offsets[idx],
                                offsetAttr.endOffset() - offsets.offsets[idx]));
                    }
                }
            }
        }
    } catch (IOException e) {
        LOG.warn("Problem found while serializing XML: " + e.getMessage(), e);
    }
}
From source file: org.exist.indexing.lucene.XMLToQuery.java
License: Open Source License

private Query phraseQuery(String field, Element node, Analyzer analyzer) throws XPathException {
    NodeList termList = node.getElementsByTagName("term");
    if (termList.getLength() == 0) {
        PhraseQuery query = new PhraseQuery();
        String qstr = getText(node);
        try {
            TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr));
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                query.add(new Term(field, termAttr.toString()));
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            throw new XPathException("Error while parsing phrase query: " + qstr);
        }
        int slop = getSlop(node);
        if (slop > -1)
            query.setSlop(slop);
        return query;
    }
    MultiPhraseQuery query = new MultiPhraseQuery();
    for (int i = 0; i < termList.getLength(); i++) {
        Element elem = (Element) termList.item(i);
        String text = getText(elem);
        if (text.indexOf('?') > -1 || text.indexOf('*') > 0) {
            Term[] expanded = expandTerms(field, text);
            if (expanded.length > 0)
                query.add(expanded);
        } else {
            String termStr = getTerm(field, text, analyzer);
            if (termStr != null)
                query.add(new Term(field, text));
        }
    }
    int slop = getSlop(node);
    if (slop > -1)
        query.setSlop(slop);
    return query;
}