List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
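Before the examples, here is a minimal standalone sketch of the consumption contract every snippet below relies on: obtain a TokenStream from an Analyzer, call reset() before the first incrementToken(), then end() and close() when finished. This is an illustrative sketch only, assuming Lucene 5.x or later (no-arg StandardAnalyzer constructor); the field name and sample text are placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // tokenStream() returns an unconsumed stream; reset() is mandatory
        // before the first incrementToken(), and end()/close() afterwards.
        try (TokenStream stream = analyzer.tokenStream("body", "Hello Lucene token streams")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end(); // records final offset/position state
        }
        analyzer.close();
    }
}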
From source file:org.sindice.siren.qparser.keyword.processors.DatatypeAnalyzerProcessor.java
License:Apache License
@Override
protected QueryNode postProcessNode(final QueryNode node) throws QueryNodeException {
    if (node instanceof TextableQueryNode
        && !(node instanceof WildcardQueryNode)
        && !(node instanceof FuzzyQueryNode)
        && !(node instanceof RegexpQueryNode)
        && !(node.getParent() instanceof RangeQueryNode)) {
      this.positionIncrementsEnabled = false;
      final Boolean positionIncrementsEnabled = this.getQueryConfigHandler()
          .get(ConfigurationKeys.ENABLE_POSITION_INCREMENTS);
      if (positionIncrementsEnabled != null) {
        this.positionIncrementsEnabled = positionIncrementsEnabled;
      }

      final FieldQueryNode fieldNode = ((FieldQueryNode) node);
      final String text = fieldNode.getTextAsString();
      final String field = fieldNode.getFieldAsString();
      final String datatype = (String) fieldNode.getTag(DatatypeQueryNode.DATATYPE_TAGID);
      if (datatype == null) {
        return node;
      }

      final Analyzer analyzer = this.getQueryConfigHandler()
          .get(KeywordConfigurationKeys.DATATYPES_ANALYZERS)
          .get(datatype);
      if (analyzer == null) {
        throw new QueryNodeException(new MessageImpl(QueryParserMessages.INVALID_SYNTAX,
            "No analyzer associated with " + datatype));
      }

      PositionIncrementAttribute posIncrAtt = null;
      int numTokens = 0;
      int positionCount = 0;
      boolean severalTokensAtSamePosition = false;

      final TokenStream source;
      try {
        source = analyzer.tokenStream(field, new StringReader(text));
        source.reset();
      } catch (final IOException e1) {
        throw new RuntimeException(e1);
      }
      final CachingTokenFilter buffer = new CachingTokenFilter(source);

      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
      }

      try {
        while (buffer.incrementToken()) {
          numTokens++;
          final int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;
          } else {
            severalTokensAtSamePosition = true;
          }
        }
      } catch (final IOException e) {
        // ignore
      }

      try {
        // rewind the buffer stream
        buffer.reset();
        // close original stream - all tokens buffered
        source.close();
      } catch (final IOException e) {
        // ignore
      }

      if (!buffer.hasAttribute(CharTermAttribute.class)) {
        return new NoTokenFoundQueryNode();
      }
      final CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);

      if (numTokens == 0) {
        if (nbTwigs != 0) {
          // Twig special case
          return new WildcardNodeQueryNode();
        }
        return new NoTokenFoundQueryNode();
      } else if (numTokens == 1) {
        String term = null;
        try {
          boolean hasNext;
          hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.toString();
        } catch (final IOException e) {
          // safe to ignore, because we know the number of tokens
        }
        fieldNode.setText(term);
        return fieldNode;
      } else {
        // no phrase query:
        final LinkedList<QueryNode> children = new LinkedList<QueryNode>();
        int position = -1;

        for (int i = 0; i < numTokens; i++) {
          String term = null;
          final int positionIncrement = 1;
          try {
            final boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.toString();
          } catch (final IOException e) {
            // safe to ignore, because we know the number of tokens
          }

          final FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);
          if (this.positionIncrementsEnabled) {
            position += positionIncrement;
            newFieldNode.setPositionIncrement(position);
          } else {
            newFieldNode.setPositionIncrement(i);
          }
          children.add(new FieldQueryNode(field, term, -1, -1));
        }

        if (node.getParent() instanceof TokenizedPhraseQueryNode) {
          throw new QueryNodeException(new MessageImpl("Cannot build a MultiPhraseQuery"));
        }
        // If multiple terms at one single position, this must be a query
        // expansion. Perform a OR between the terms.
        if (severalTokensAtSamePosition && positionCount == 1) {
          return new GroupQueryNode(new OrQueryNode(children));
        }
        // if several tokens at same position && position count > 1, then
        // results can be unexpected
        else {
          final TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
          for (int i = 0; i < children.size(); i++) {
            pq.add(children.get(i));
          }
          return pq;
        }
      }
    } else if (node instanceof TwigQueryNode) {
      nbTwigs--;
      assert nbTwigs >= 0;
    }
    return node;
}
From source file:org.sindice.siren.qparser.keyword.processors.PhraseQueryNodeProcessor.java
License:Apache License
@Override
protected QueryNode postProcessNode(final QueryNode node) throws QueryNodeException {
    if (node instanceof TextableQueryNode
        && !(node instanceof WildcardQueryNode)
        && !(node instanceof FuzzyQueryNode)
        && !(node instanceof RegexpQueryNode)
        && !(node.getParent() instanceof RangeQueryNode)) {
      final FieldQueryNode fieldNode = ((FieldQueryNode) node);
      final String text = fieldNode.getTextAsString();
      final String field = fieldNode.getFieldAsString();

      final TokenStream source;
      try {
        source = this.analyzer.tokenStream(field, new StringReader(text));
        source.reset();
      } catch (final IOException e1) {
        throw new RuntimeException(e1);
      }
      final CachingTokenFilter buffer = new CachingTokenFilter(source);

      int numTokens = 0;
      try {
        while (buffer.incrementToken()) {
          numTokens++;
        }
      } catch (final IOException e) {
        // ignore
      }

      try {
        // rewind the buffer stream
        buffer.reset();
        // close original stream - all tokens buffered
        source.close();
      } catch (final IOException e) {
        // ignore
      }

      if (!buffer.hasAttribute(CharTermAttribute.class)) {
        return new NoTokenFoundQueryNode();
      }
      final CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);

      if (numTokens == 0) {
        return new NoTokenFoundQueryNode();
      } else if (numTokens != 1) {
        // phrase query
        final TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
        for (int i = 0; i < numTokens; i++) {
          String term = null;
          try {
            final boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.toString();
          } catch (final IOException e) {
            // safe to ignore, because we know the number of tokens
          }
          final FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);
          newFieldNode.setPositionIncrement(i);
          pq.add(newFieldNode);
        }
        return pq;
      }
    }
    return node;
}
From source file:org.solbase.lucenehbase.IndexWriter.java
License:Apache License
@SuppressWarnings("unchecked")
public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber,
        List<String> sortFieldNames) throws CorruptIndexException, IOException {
    // given doc, what are all of terms we indexed
    List<Term> allIndexedTerms = new ArrayList<Term>();
    Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024);

    // need to hold onto TermDocMetaData, so it can return this array
    List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>();

    byte[] docId = Bytes.toBytes(docNumber);
    int position = 0;

    for (Fieldable field : (List<Fieldable>) doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {

            TokenStream tokens = field.tokenStreamValue();
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }

            // collect term information per field
            Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation =
                    new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>();

            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            tokens.reset(); // reset the TokenStream to the first token

            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens
                        .addAttribute(PositionIncrementAttribute.class);

            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);

            // store normalizations of field per term per document rather
            // than per field.
            // this adds more to write but less to read on other side
            Integer tokensInField = new Integer(0);

            while (tokens.incrementToken()) {
                tokensInField++;
                Term term = new Term(field.name(), termAttribute.term());

                allIndexedTerms.add(term);

                // fetch all collected information for this term
                Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);
                if (termInfo == null) {
                    termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }

                // term frequency
                List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes);
                if (termFrequency == null) {
                    termFrequency = new ArrayList<Number>();
                    termFrequency.add(new Integer(0));
                    termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency);
                }
                // increment
                termFrequency.set(0, termFrequency.get(0).intValue() + 1);

                // position vector
                if (field.isStorePositionWithTermVector()) {
                    position += (posIncrAttribute.getPositionIncrement() - 1);

                    List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes);
                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector);
                    }

                    positionVector.add(++position);
                }

                // term offsets
                if (field.isStoreOffsetWithTermVector()) {
                    List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector);
                    }

                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());
                }

                List<Number> sortValues = new ArrayList<Number>();
                // init sortValues
                for (int i = 0; i < Scorer.numSort; i++) {
                    sortValues.add(new Integer(-1));
                }

                int order = 0;

                // extract sort field value and store it in term doc metadata obj
                for (String fieldName : sortFieldNames) {
                    Fieldable fieldable = doc.getFieldable(fieldName);

                    if (fieldable instanceof EmbeddedSortField) {
                        EmbeddedSortField sortField = (EmbeddedSortField) fieldable;

                        int value = -1;
                        if (sortField.stringValue() != null) {
                            value = Integer.parseInt(sortField.stringValue());
                        }
                        int sortSlot = sortField.getSortSlot();

                        sortValues.set(sortSlot - 1, new Integer(value));
                    } else {
                        // TODO: this logic is used for real time indexing.
                        // hacky. depending on order of sort field names in array
                        int value = -1;
                        if (fieldable.stringValue() != null) {
                            value = Integer.parseInt(fieldable.stringValue());
                        }
                        sortValues.set(order++, new Integer(value));
                    }
                }
                termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues);
            }

            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }

            for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) {
                Term tempTerm = term.getKey();

                byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm);

                // Mix in the norm for this field alongside each term
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm);
                }

                TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes,
                        tempTerm);
                metadatas.add(data);
            }
        }

        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            Term term = new Term(field.name(), field.stringValue());
            allIndexedTerms.add(term);

            byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term);

            Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
            termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {}));
            termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {}));

            TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term);
            metadatas.add(data);
        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {
            byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());

            // first byte flags if binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);
            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);

            // logic to handle multiple fields w/ same name
            byte[] currentValue = fieldCache.get(field.name());
            if (currentValue == null) {
                fieldCache.put(field.name(), value);
            } else {
                // append new data
                byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1];
                System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1);
                System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1,
                        SolbaseUtil.delimiter.length);
                System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1,
                        value.length);

                fieldCache.put(field.name(), newValue);
            }
        }
    }

    Put documentPut = new Put(SolbaseUtil.randomize(docNumber));

    // Store each field as a column under this docId
    for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) {
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue());
    }

    // in case of real time update, we need to add back docId field
    if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) {
        byte[] docIdStr = Bytes.toBytes(new Integer(docNumber).toString());

        // first byte flags if binary or not
        byte[] value = new byte[docIdStr.length + 1];
        System.arraycopy(docIdStr, 0, value, 0, docIdStr.length);
        value[value.length - 1] = (byte) (Byte.MIN_VALUE);

        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value);
    }

    // Finally, Store meta-data so we can delete this document
    documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"),
            SolbaseUtil.toBytes(allIndexedTerms).array());

    ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms);
    return parsedDoc;
}
From source file:org.splevo.vpm.analyzer.semantic.lucene.LuceneCodeAnalyzer.java
License:Open Source License
/**
 * Stem a list of words with a configured stemmer.
 *
 * @param words
 *            The list of words to stem.
 * @param stemming
 *            The stemmer to be used.
 * @return The stemmed list of words.
 */
@SuppressWarnings("resource")
public static String[] stemWords(String[] words, Stemming stemming) {
    Set<String> stemmedStopWords = Sets.newHashSet();

    for (String word : words) {
        TokenStream tokenStream = new StandardTokenizer(LUCENE_VERSION, new StringReader(word));
        tokenStream = Stemming.wrapStemmingFilter(tokenStream, stemming);

        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                stemmedStopWords.add(term);
            }
        } catch (IOException e) {
            logger.error("Failed to stem a list of words", e);
        }
    }
    return stemmedStopWords.toArray(new String[] {});
}
From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java
License:Apache License
private void processFieldEntry(String fieldName, String s, CharArraySet set) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, s);
    CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        set.add(cattr.toString());
    }
    ts.end();
    ts.close();
}
From source file:org.tallison.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader.java
License:Apache License
private int addFieldValue(String fieldName, int currInd, int charBase, String fieldValue,
        TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results) throws IOException {
    //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true);
    TokenStream stream = baseAnalyzer.tokenStream(fieldName, fieldValue);
    stream.reset();

    int defaultInc = 1;

    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    PositionIncrementAttribute incAtt = null;
    if (stream.hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) {
        incAtt = stream
                .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
    }

    while (stream.incrementToken()) {
        //Do we need this?
        if (incAtt != null && incAtt.getPositionIncrement() == 0) {
            continue;
        }

        currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc;
        if (requests.contains(currInd)) {
            results.add(currInd, offsetAtt.startOffset() + charBase, offsetAtt.endOffset() + charBase,
                    termAtt.toString());
        }
        if (currInd > requests.getLast()) {
            // TODO: Is there a way to avoid this? Or, is this
            // an imaginary performance hit?
            while (stream.incrementToken()) {
                //NO-OP
            }
            stream.end();
            stream.close();
            return GOT_ALL_REQUESTS;
        }
    }
    stream.end();
    stream.close();
    return currInd;
}
From source file:org.tallison.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil.java
License:Apache License
/**
 * allows reuse of terms, this method calls terms.clear() before adding new
 * terms
 *
 * @param s string to analyze
 * @param field to use in analysis
 * @param analyzer analyzer
 * @param terms list for reuse
 * @return list of strings
 * @throws java.io.IOException if there's an IOException during analysis
 */
public static List<String> getTermStrings(String s, String field, Analyzer analyzer, List<String> terms)
        throws IOException {
    if (terms == null) {
        terms = new ArrayList<>();
    }
    terms.clear();
    TokenStream stream = analyzer.tokenStream(field, s);
    stream.reset();
    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    while (stream.incrementToken()) {
        terms.add(termAtt.toString());
    }
    stream.end();
    stream.close();
    return terms;
}
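A hypothetical call site for the helper above, sketched under the assumption that it lives in SimpleAnalyzerUtil (per the source file name) and that a StandardAnalyzer is acceptable for the field; the class name, field name, and sample text are illustrative only.

import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class GetTermStringsUsage {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // passing null for the reuse list makes the helper allocate a fresh one
        List<String> terms = SimpleAnalyzerUtil.getTermStrings("The quick brown fox", "body", analyzer, null);
        System.out.println(terms);
        analyzer.close();
    }
}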
From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java
License:Apache License
@Test
public void testBasicNoUnigrams() throws Exception {
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, false);
    String s = "a b c d e f g";
    TokenStream tokenStream = analyzer.tokenStream(ConcordanceTestBase.FIELD, s);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
    List<String> expected = Arrays.asList(new String[] { "a_b", "b_c", "c_d", "d_e", "e_f", "f_g", });
    List<String> returned = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        String token = charTermAttribute.toString();
        assertEquals(1, posIncAttribute.getPositionIncrement());
        returned.add(token);
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(expected, returned);
}
From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java
License:Apache License
@Test
public void testIncludeUnigrams() throws Exception {
    List<String> expected = Arrays.asList(new String[] { "a", "a_b", "b", "b_c", "c", "c_d", "d", "d_e", "e",
            "e_f", "f", "f_g", "g", });
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, true);
    String s = "a b c d e f g";
    TokenStream tokenStream = analyzer.tokenStream("f", s);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
    List<String> returned = new ArrayList<>();
    int i = 0;
    while (tokenStream.incrementToken()) {
        String token = charTermAttribute.toString();
        if (i++ % 2 == 0) {
            assertEquals(1, posIncAttribute.getPositionIncrement());
        } else {
            assertEquals(0, posIncAttribute.getPositionIncrement());
        }
        returned.add(token);
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(expected, returned);
}
From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java
License:Apache License
@Test
public void testCJKNoUnigrams() throws Exception {
    final CharacterRunAutomaton stops = MockTokenFilter.EMPTY_STOPSET;
    int posIncGap = 10;
    final int charOffsetGap = 10;
    Analyzer analyzer = getCJKBigramAnalyzer(false);

    TokenStream ts = analyzer.tokenStream(FIELD, "");
    ts.reset();
    CharTermAttribute charTermAttribute = ts.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = ts.getAttribute(PositionIncrementAttribute.class);
    ts.end();
    ts.close();

    String[] docs = new String[] { "" };
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD)));
    Query q = new TermQuery(new Term(FIELD, ""));

    //now test straight and span wrapper
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher, FIELD, q, q, analyzer, collector);
    for (ConcordanceWindow w : collector.getWindows()) {
        //System.out.println(w);
    }
    reader.close();
    directory.close();
}