List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
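A typical consumer registers the attributes it needs with addAttribute() before iterating, then reads the same attribute instances inside the incrementToken() loop. A minimal sketch of that pattern, assuming Lucene 4.x or later (where reset(), end() and close() are required) and an arbitrary field name:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Minimal consume loop: addAttribute() returns the shared attribute instance
// that incrementToken() updates in place for every token.
static void printTokens(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("field", text)) {   // "field" is an arbitrary name
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                     // must be called before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        ts.end();                       // records end-of-stream state (e.g. final offset)
    }                                   // try-with-resources closes the stream
}

The examples below show variations of this loop taken from real projects.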
From source file:org.nuxeo.ecm.platform.categorization.categorizer.tfidf.TfIdfCategorizer.java
License:Open Source License
public List<String> tokenize(String textContent) {
    try {
        List<String> terms = new ArrayList<String>();
        TokenStream tokenStream = getAnalyzer().tokenStream(null, textContent);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        return terms;
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
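Since the example only calls end() and close() on the success path, the stream would stay open if incrementToken() threw. Because TokenStream implements Closeable, a try-with-resources variant releases it in all cases; a sketch under the same assumptions (null field name, the class's getAnalyzer()):

public List<String> tokenize(String textContent) {
    List<String> terms = new ArrayList<String>();
    try (TokenStream tokenStream = getAnalyzer().tokenStream(null, textContent)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
    return terms;
}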
From source file:org.olat.search.ui.SearchInputController.java
License:Apache License
protected Set<String> getHighlightWords(final String searchString) {
    try {
        final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        final TokenStream stream = analyzer.tokenStream("content", new StringReader(searchString));
        final TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        for (boolean next = stream.incrementToken(); next; next = stream.incrementToken()) {
            final String term = termAtt.term();
            if (log.isDebug()) {
                log.debug(term);
            }
        }
    } catch (final IOException e) {
        log.error("", e);
    }
    return null;
}
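Note that this method only logs the terms and always returns null. TermAttribute and its term() method belong to the old pre-4.0 attribute API and were removed in Lucene 4.0; on current versions the same loop would use CharTermAttribute instead, roughly as sketched below (newer versions would also require reset()/end()/close(), which this example omits):

final TokenStream stream = analyzer.tokenStream("content", new StringReader(searchString));
final CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
    final String term = termAtt.toString();   // toString() replaces the removed term()
    if (log.isDebug()) {
        log.debug(term);
    }
}
stream.end();
stream.close();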
From source file:org.omegat.tokenizer.BaseTokenizer.java
License:Open Source License
protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
        final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    List<Token> result = new ArrayList<Token>(64);
    final TokenStream in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset()));
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new Token[result.size()]);
}
From source file:org.omegat.tokenizer.BaseTokenizer.java
License:Open Source License
protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
        boolean filterDigits, boolean filterWhitespace) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }
    List<String> result = new ArrayList<String>(64);
    final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    Locale loc = stemsAllowed ? getLanguage().getLocale() : null;
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(tokenText);
                if (stemsAllowed) {
                    String origText = str.substring(off.startOffset(), off.endOffset());
                    if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                        result.add(origText);
                    }
                }
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new String[result.size()]);
}
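In both BaseTokenizer methods the addAttribute()/getAttribute() pairs could be collapsed: addAttribute() registers the attribute if it is not yet present and returns the shared instance, and calling it again for the same class returns the same object. A sketch of the shorter form:

final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
// addAttribute() both registers the attribute and returns the instance that
// incrementToken() will update, so separate getAttribute() calls are unnecessary.
CharTermAttribute cattr = in.addAttribute(CharTermAttribute.class);
OffsetAttribute off = in.addAttribute(OffsetAttribute.class);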
From source file:org.opencloudengine.flamingo.mapreduce.util.Lucene4Utils.java
License:Apache License
public static List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<String>();
    TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(string));
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        result.add(charTermAttribute.toString());
    }
    return result;
}
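This helper never calls reset(), end() or close(); on Lucene 4.x and later, calling incrementToken() without a prior reset() typically fails with an IllegalStateException about a TokenStream contract violation. A variant with the full lifecycle (same signature, the unused OffsetAttribute dropped) might look like this:

public static List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<String>();
    try (TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(string))) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.add(charTermAttribute.toString());
        }
        tokenStream.end();
    }
    return result;
}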
From source file:org.opengrok.web.api.v1.suggester.query.SuggesterQueryParser.java
License:Open Source License
private static List<String> getAllTokens(final Analyzer analyzer, final String field, final String text) {
    List<String> tokens = new LinkedList<>();
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(field, text);
        CharTermAttribute attr = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(attr.toString());
        }
    } catch (IOException e) {
        logger.log(Level.WARNING, "Could not analyze query text", e);
    } finally {
        try {
            if (ts != null) {
                ts.end();
                ts.close();
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "Could not close token stream", e);
        }
    }
    return tokens;
}
From source file:org.opensextant.solrtexttagger.Tagger.java
License:Open Source License
public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream, TagClusterReducer tagClusterReducer,
        boolean skipAltTokens, boolean ignoreStopWords) throws IOException {
    this.terms = terms;
    this.liveDocs = liveDocs;
    this.tokenStream = tokenStream;
    this.skipAltTokens = skipAltTokens;
    this.ignoreStopWords = ignoreStopWords;
    // termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    lookupAtt = tokenStream.addAttribute(TaggingAttribute.class);
    tokenStream.reset();
    this.tagClusterReducer = tagClusterReducer;
}
From source file:org.pageseeder.flint.lucene.query.Queries.java
License:Apache License
/**
 * Returns the terms for a field
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 *
 * @throws IOException
 */
private static void addTermsToPhrase(String field, String text, Analyzer analyzer, PhraseQuery phrase) {
    try {
        TokenStream stream = analyzer.tokenStream(field, text);
        PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += increment.getPositionIncrement();
            Term term = new Term(field, attribute.toString());
            phrase.add(term, position);
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
}
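This example targets the older mutable PhraseQuery with an add(Term, int) method. In newer Lucene releases (5.3 onward) PhraseQuery is assembled through PhraseQuery.Builder, so the same analysis loop would roughly become the following (a hypothetical toPhrase() helper that returns the built query instead of mutating an argument):

private static PhraseQuery toPhrase(String field, String text, Analyzer analyzer) throws IOException {
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    try (TokenStream stream = analyzer.tokenStream(field, text)) {
        PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += increment.getPositionIncrement();
            builder.add(new Term(field, attribute.toString()), position);
        }
        stream.end();
    }
    return builder.build();
}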
From source file:org.pageseeder.flint.lucene.search.Fields.java
License:Apache License
/**
 * Returns the terms for a field
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 *
 * @throws IOException
 */
public static List<String> toTerms(String field, String text, Analyzer analyzer) {
    List<String> terms = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = attribute.toString();
            terms.add(term);
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
    return terms;
}
From source file:org.solbase.lucenehbase.IndexWriter.java
License:Apache License
@SuppressWarnings("unchecked") public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException { // given doc, what are all of terms we indexed List<Term> allIndexedTerms = new ArrayList<Term>(); Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024); // need to hold onto TermDocMetaData, so it can return this array List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>(); byte[] docId = Bytes.toBytes(docNumber); int position = 0; for (Fieldable field : (List<Fieldable>) doc.getFields()) { // Indexed field if (field.isIndexed() && field.isTokenized()) { TokenStream tokens = field.tokenStreamValue(); if (tokens == null) { tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue())); }/*from www .ja va 2s . com*/ // collect term information per field Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>(); int lastOffset = 0; if (position > 0) { position += analyzer.getPositionIncrementGap(field.name()); } tokens.reset(); // reset the TokenStream to the first token // offsets OffsetAttribute offsetAttribute = null; if (field.isStoreOffsetWithTermVector()) offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class); // positions PositionIncrementAttribute posIncrAttribute = null; if (field.isStorePositionWithTermVector()) posIncrAttribute = (PositionIncrementAttribute) tokens .addAttribute(PositionIncrementAttribute.class); TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class); // store normalizations of field per term per document // rather // than per field. // this adds more to write but less to read on other side Integer tokensInField = new Integer(0); while (tokens.incrementToken()) { tokensInField++; Term term = new Term(field.name(), termAttribute.term()); allIndexedTerms.add(term); // fetch all collected information for this term Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term); if (termInfo == null) { termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>(); allTermInformation.put(term, termInfo); } // term frequency List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes); if (termFrequency == null) { termFrequency = new ArrayList<Number>(); termFrequency.add(new Integer(0)); termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency); } // increment termFrequency.set(0, termFrequency.get(0).intValue() + 1); // position vector if (field.isStorePositionWithTermVector()) { position += (posIncrAttribute.getPositionIncrement() - 1); List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes); if (positionVector == null) { positionVector = new ArrayList<Number>(); termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector); } positionVector.add(++position); } // term offsets if (field.isStoreOffsetWithTermVector()) { List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes); if (offsetVector == null) { offsetVector = new ArrayList<Number>(); termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector); } offsetVector.add(lastOffset + offsetAttribute.startOffset()); offsetVector.add(lastOffset + offsetAttribute.endOffset()); } List<Number> sortValues = new ArrayList<Number>(); // init sortValues for (int i = 0; i < Scorer.numSort; i++) { sortValues.add(new Integer(-1)); } int order = 0; // extract sort field 
value and store it in term doc metadata obj for (String fieldName : sortFieldNames) { Fieldable fieldable = doc.getFieldable(fieldName); if (fieldable instanceof EmbeddedSortField) { EmbeddedSortField sortField = (EmbeddedSortField) fieldable; int value = -1; if (sortField.stringValue() != null) { value = Integer.parseInt(sortField.stringValue()); } int sortSlot = sortField.getSortSlot(); sortValues.set(sortSlot - 1, new Integer(value)); } else { // TODO: this logic is used for real time indexing. // hacky. depending on order of sort field names in array int value = -1; if (fieldable.stringValue() != null) { value = Integer.parseInt(fieldable.stringValue()); } sortValues.set(order++, new Integer(value)); } } termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues); } List<Number> bnorm = null; if (!field.getOmitNorms()) { bnorm = new ArrayList<Number>(); float norm = doc.getBoost(); norm *= field.getBoost(); norm *= similarity.lengthNorm(field.name(), tokensInField); bnorm.add(Similarity.encodeNorm(norm)); } for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) { Term tempTerm = term.getKey(); byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm); // Mix in the norm for this field alongside each term // more writes but faster on read side. if (!field.getOmitNorms()) { term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm); } TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes, tempTerm); metadatas.add(data); } } // Untokenized fields go in without a termPosition if (field.isIndexed() && !field.isTokenized()) { Term term = new Term(field.name(), field.stringValue()); allIndexedTerms.add(term); byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term); Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>(); termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {})); termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {})); TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term); metadatas.add(data); } // Stores each field as a column under this doc key if (field.isStored()) { byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue()); // first byte flags if binary or not byte[] value = new byte[_value.length + 1]; System.arraycopy(_value, 0, value, 0, _value.length); value[value.length - 1] = (byte) (field.isBinary() ? 
Byte.MAX_VALUE : Byte.MIN_VALUE); // logic to handle multiple fields w/ same name byte[] currentValue = fieldCache.get(field.name()); if (currentValue == null) { fieldCache.put(field.name(), value); } else { // append new data byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1]; System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1); System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1, SolbaseUtil.delimiter.length); System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1, value.length); fieldCache.put(field.name(), newValue); } } } Put documentPut = new Put(SolbaseUtil.randomize(docNumber)); // Store each field as a column under this docId for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) { documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue()); } // in case of real time update, we need to add back docId field if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) { byte[] docIdStr = Bytes.toBytes(new Integer(docNumber).toString()); // first byte flags if binary or not byte[] value = new byte[docIdStr.length + 1]; System.arraycopy(docIdStr, 0, value, 0, docIdStr.length); value[value.length - 1] = (byte) (Byte.MIN_VALUE); documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value); } // Finally, Store meta-data so we can delete this document documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allIndexedTerms).array()); ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms); return parsedDoc; }