List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value.
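Before the source-file examples, here is a minimal, self-contained sketch of the typical call pattern. It is not taken from any of the files below; it assumes a Lucene 5.x-or-later StandardAnalyzer on the classpath, and the field name "body" and the sample text are placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("body", "a minimal getAttribute example");
            // getAttribute returns the attribute instance already registered on this stream,
            // or throws IllegalArgumentException if it is absent (addAttribute would create it).
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            stream.reset(); // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();
            stream.close();
        }
    }
}

The same loop appears throughout the examples below; older snippets use the deprecated TermAttribute and term() instead of CharTermAttribute and toString().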
From source file:com.bigdata.search.AbstractSearchTest.java
License:Open Source License
protected String getTokenStream(Analyzer a, String text) throws IOException {
    StringBuffer sb = new StringBuffer();
    TokenStream s = a.tokenStream(null, new StringReader(text));
    while (s.incrementToken()) {
        final TermAttribute term = s.getAttribute(TermAttribute.class);
        if (sb.length() != 0) {
            sb.append(" ");
        }
        sb.append(term.term());
    }
    return sb.toString();
}
From source file:com.bigdata.search.AbstractSearchTest.java
License:Open Source License
private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
    TokenStream s = a.tokenStream(null, new StringReader(text));
    int ix = 0;
    while (s.incrementToken()) {
        final TermAttribute term = s.getAttribute(TermAttribute.class);
        final String word = term.term();
        assertTrue(ix < expected.length);
        assertEquals(expected[ix++], word);
    }
    assertEquals(ix, expected.length);
}
From source file:com.bigdata.search.FullTextIndex.java
License:Open Source License
/**
 * Index a field in a document.
 * <p>
 * Note: This method does NOT force a write on the indices. If the <i>buffer</i>
 * overflows, then there will be an index write. Once the caller is done
 * indexing, they MUST invoke {@link TokenBuffer#flush()} to force any data
 * remaining in their <i>buffer</i> to the indices.
 * <p>
 * Note: If a document is pre-existing, then the existing data for that
 * document MUST be removed unless you know that the fields to be found in
 * the document will not have changed (they may have different contents, but
 * the same fields exist in the old and new versions of the document).
 *
 * @param buffer
 *            Used to buffer writes onto the text index.
 * @param docId
 *            The document identifier.
 * @param fieldId
 *            The field identifier.
 * @param languageCode
 *            The language code -or- <code>null</code> to use the default
 *            {@link Locale}.
 * @param r
 *            A reader on the text to be indexed.
 * @param filterStopwords
 *            if true, filter stopwords from the token stream
 *
 * @see TokenBuffer#flush()
 */
public void index(final TokenBuffer<V> buffer, final V docId, final int fieldId,
        final String languageCode, final Reader r, final boolean filterStopwords) {

    /*
     * Note: You can invoke this on a read-only index. It is only overflow
     * of the TokenBuffer that requires a writable index. Overflow itself
     * will only occur on {document,field} tuple boundaries, so it will
     * never overflow when indexing a search query.
     */
    // assertWritable();

    int n = 0;

    // tokenize (note: docId,fieldId are not on the tokenStream, but the field could be).
    final TokenStream tokenStream = getTokenStream(languageCode, r, filterStopwords);

    try {
        while (tokenStream.incrementToken()) {
            final TermAttribute term = tokenStream.getAttribute(TermAttribute.class);
            buffer.add(docId, fieldId, term.term());
            n++;
        }
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }

    if (log.isInfoEnabled())
        log.info("Indexed " + n + " tokens: docId=" + docId + ", fieldId=" + fieldId);
}
From source file:com.bizosys.hsearch.inpipe.ComputeTokens.java
License:Apache License
private void tokenize(Doc doc, TermStream ts) throws SystemFault, ApplicationFault, IOException {
    if (null == ts)
        return;
    TokenStream stream = ts.stream;
    if (null == stream)
        return;

    DocTerms terms = doc.terms;
    if (null == doc.terms) {
        terms = new DocTerms();
        doc.terms = terms;
    }

    String token = null;
    int offset = 0;
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        token = termA.toString();
        offset = offsetA.startOffset();
        Term term = new Term(doc.tenant, token, ts.sighting, ts.type, offset);
        terms.getTermList().add(term);
    }
    stream.close();
}
From source file:com.bizosys.hsearch.inpipe.util.StemFilterWrap.java
License:Apache License
public StemFilterWrap(TokenStream in) {
    super(in);
    stemmer = Stemmer.getInstance();
    this.termA = (CharTermAttribute) in.getAttribute(CharTermAttribute.class);
}
From source file:com.bizosys.hsearch.kv.impl.StorageReader.java
License:Apache License
/**
 * Returns the ids for analyzed field that is not repeatable.
 * @param checkForAllWords
 * @param fieldName
 * @return
 * @throws IOException
 */
private final BitSetOrSet readStorageTextIdsSet(final boolean checkForAllWords, final String fieldName,
        String fieldQuery) throws IOException {

    StringBuilder sb = new StringBuilder();
    String docType = "*";
    String fieldType = fieldName;
    String wordHash = null;
    int hash = 0;
    BitSetOrSet destination = new BitSetOrSet();
    boolean isVirgin = true;
    String currentRowId = null;

    String mergeid = rowId.substring(0, rowId.lastIndexOf('_'));

    int fieldTypeLoc = fieldName.indexOf('/');
    if (fieldTypeLoc > 0) {
        docType = fieldName.substring(0, fieldTypeLoc);
        fieldType = fieldName.substring(fieldTypeLoc + 1);
    }

    byte[] dataChunk = null;

    try {
        Map<String, Integer> dTypes = new HashMap<String, Integer>(1);
        dTypes.put(docType, 1);
        setDocumentTypeCodes(dTypes);

        Map<String, Integer> fTypes = new HashMap<String, Integer>(1);
        fTypes.put(fieldType, 1);
        setFieldTypeCodes(fTypes);

        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fieldName);
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(fieldQuery));
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        Set<String> terms = new LinkedHashSet<String>();
        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }

        String docTypeCode = "*".equals(docType) ? "*"
                : new Integer(DocumentTypeCodes.getInstance().getCode(docType)).toString();
        String fldTypeCode = "*".equals(fieldType) ? "*"
                : new Integer(FieldTypeCodes.getInstance().getCode(fieldType)).toString();

        for (String term : terms) {

            if (DEBUG_ENABLED) {
                IdSearchLog.l.debug("Finding Term :" + term);
            }

            hash = Hashing.hash(term);
            wordHash = new Integer(hash).toString();

            sb.delete(0, sb.length());
            fieldQuery = sb.append(docTypeCode).append('|').append(fldTypeCode).append('|').append('*')
                    .append('|').append(hash).append('|').append("*|*").toString();
            sb.delete(0, sb.length());

            currentRowId = mergeid + "_" + wordHash.charAt(0) + "_" + wordHash.charAt(wordHash.length() - 1);

            ComputeKV compute = new ComputeKV();
            compute.kvType = (instruction.getOutputType() == Datatype.FREQUENCY_INDEX) ? Datatype.STRING
                    : instruction.getOutputType();
            compute.kvRepeatation = instruction.getProcessingHint().startsWith("true");
            compute.isCompressed = instruction.getProcessingHint().endsWith("true");

            byte[] data = KvRowReaderFactory.getInstance().getReader(this.isCachable).readStoredProcedureBlob(
                    tableName, currentRowId.getBytes(), compute, null, null, filterQuery, instruction);

            Collection<byte[]> dataL = SortedBytesArray.getInstanceArr().parse(data).values();
            int size = (null == dataL) ? 0 : dataL.size();

            if (checkForAllWords) {
                if (size > 0) {
                    dataChunk = dataL.isEmpty() ? null : dataL.iterator().next();
                    if (dataChunk == null) {
                        destination.clear();
                        break;
                    }
                } else {
                    destination.clear();
                    break;
                }

                BitSetWrapper bitSets = SortedBytesBitset.getInstanceBitset().bytesToBitSet(dataChunk, 0,
                        dataChunk.length);

                if (isVirgin) {
                    destination.setDocumentSequences(bitSets);
                    isVirgin = false;
                    continue;
                }

                BitSetOrSet source = new BitSetOrSet();
                source.setDocumentSequences(bitSets);
                destination.and(source);

            } else {
                if (size == 0)
                    continue;

                dataChunk = dataL.isEmpty() ? null : dataL.iterator().next();
                if (dataChunk == null)
                    continue;

                BitSetWrapper bitSets = SortedBytesBitset.getInstanceBitset().bytesToBitSet(dataChunk, 0,
                        dataChunk.length);

                if (isVirgin) {
                    destination.setDocumentSequences(bitSets);
                    isVirgin = false;
                    continue;
                } else {
                    BitSetOrSet source = new BitSetOrSet();
                    source.setDocumentSequences(bitSets);
                    destination.or(source);
                }
            }
        }

        return destination;

    } catch (Exception e) {
        String msg = "Error while processing query [" + fieldQuery + "]\n";
        msg = msg + "Found Data Chunk\t" + ((null == dataChunk) ? "None" : new String(dataChunk));
        IdSearchLog.l.fatal(this.getClass().getName() + ":\t" + msg);
        e.printStackTrace();
        throw new IOException(msg, e);
    }
}
From source file:com.bizosys.hsearch.kv.impl.StorageReader.java
License:Apache License
/**
 * Returns the ids for analyzed field that is repeatable.
 * @param checkForAllWords
 * @param biWord
 * @param triWord
 * @param isCompressed
 * @param isCached
 * @param fieldName
 * @param enableNGram
 * @return
 * @throws IOException
 */
private final BitSetOrSet readStorageTextIdsBitset(final boolean checkForAllWords, final String fieldQuery,
        final boolean biWord, final boolean triWord, boolean isCompressed, boolean isCached,
        final String fieldName, boolean enableNGram, boolean checkExactPhrase) throws IOException {

    BitSetOrSet destination = new BitSetOrSet();
    String rowIdPrefix = rowId;

    try {
        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fieldName);
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(fieldQuery));
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        Set<String> terms = new LinkedHashSet<String>();
        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }
        int termsT = terms.size();

        if (enableNGram) {

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("NGRam Explosion");

            int subsequenceLen = 1;
            if (biWord)
                subsequenceLen = 2;
            else if (triWord)
                subsequenceLen = 3;

            /**
             * There may be a penalty on performance.
             * Don't allow total search phrases > 10
             */
            if (triWord && (termsT > 4))
                subsequenceLen = 2;
            if ((subsequenceLen == 2) && (termsT > 5))
                subsequenceLen = 1;

            /**
             * "red party gown"
             * "party gown dress"
             * "red party"
             * "party gown"
             * "gown dress"
             * "red"
             * "party"
             * "gown"
             * "dress"
             */
            List<String> phrases = new ArrayList<String>();
            StringBuilder sb = new StringBuilder(1024);
            String[] termsA = new String[terms.size()];
            terms.toArray(termsA);

            for (int subSequence = subsequenceLen; subSequence > 0; subSequence--) {
                if (subSequence <= 0)
                    break;
                for (int wordPosition = 0; wordPosition <= termsT - subSequence; wordPosition++) {
                    for (int pos = 0; pos < subSequence; pos++) {
                        if (pos > 0)
                            sb.append(' ');
                        sb.append(termsA[wordPosition + pos]);
                    }
                    phrases.add(sb.toString());
                    sb.setLength(0);
                }
            }

            for (String phrase : phrases) {
                BitSetOrSet phraseMatches = new BitSetOrSet();
                findATerm(checkForAllWords, isCompressed, isCached, phraseMatches, rowIdPrefix, phrase, false);
                destination.orQueryWithFoundIds.put(phrase, phraseMatches);
                destination.or(phraseMatches);
            }

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("NGram Query OR trace > " + destination.orQueryWithFoundIds.toString());

            return destination;

        } else {

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("Normal Query processing");

            // check for all words
            BitSetOrSet highRanked = null;

            switch (termsT) {

            case 2: {
                /**
                 * All 2 words are consecutive
                 */
                if (biWord) {
                    Iterator<String> itr = terms.iterator();
                    String phrase = itr.next() + " " + itr.next();
                    findATerm(checkForAllWords, isCompressed, isCached, destination, rowIdPrefix, phrase, true);

                    BitSetWrapper result = destination.getDocumentSequences();
                    int resultT = (null == result) ? 0 : result.cardinality();
                    if (resultT > 0 || checkExactPhrase)
                        return destination;
                }

                /*
                 * Biword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }

            case 3: {
                /**
                 * All 3 words are consecutive
                 */
                Iterator<String> itr = terms.iterator();
                String word1 = itr.next();
                String word2 = itr.next();
                String word3 = itr.next();

                if (triWord) {
                    String phrase = word1 + " " + word2 + " " + word3;
                    findATerm(checkForAllWords, isCompressed, isCached, destination, rowIdPrefix, phrase, true);

                    BitSetWrapper result = destination.getDocumentSequences();
                    int resultT = (null == result) ? 0 : result.cardinality();
                    if (resultT > 0 || checkExactPhrase)
                        return destination;
                }

                /**
                 * If Check for all words is true minimum required result is 1 for three words
                 * else minimum required result is 0
                 */
                int requiredMinResult = checkForAllWords ? 1 : 0;

                /**
                 * 2 words are consecutive, take them and apply findAll on them
                 */
                if (biWord) {
                    String biword1 = word1 + " " + word2;
                    String biword2 = word2 + " " + word3;
                    String biword3 = word1 + " " + word3;

                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { biword1, biword2, biword3 };
                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase,
                                false);
                        if (result > 0)
                            found++;
                    }
                    if (found > requiredMinResult || checkExactPhrase)
                        return highRanked;
                }

                /*
                 * Biword and Triword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }

            case 4: {
                Iterator<String> itr = terms.iterator();
                String word1 = itr.next();
                String word2 = itr.next();
                String word3 = itr.next();
                String word4 = itr.next();

                int requiredMinResult = 0;

                if (triWord) {
                    requiredMinResult = checkForAllWords ? 1 : 0;
                    String triword1 = word1 + " " + word2 + " " + word3;
                    String triword2 = word1 + " " + word3 + " " + word4;
                    String triword3 = word2 + " " + word3 + " " + word4;

                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { triword1, triword2, triword3 };
                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase,
                                false);
                        if (result > 0)
                            found++;
                    }
                    if (found > requiredMinResult || checkExactPhrase)
                        return highRanked;
                }

                if (biWord) {
                    requiredMinResult = checkForAllWords ? 2 : 0;
                    String biword1 = word1 + " " + word2;
                    String biword2 = word1 + " " + word3;
                    String biword3 = word1 + " " + word4;
                    String biword4 = word2 + " " + word3;
                    String biword5 = word2 + " " + word4;

                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { biword1, biword2, biword3, biword4, biword5 };
                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase,
                                false);
                        if (result > 0)
                            found++;
                    }
                    if (found > requiredMinResult || checkExactPhrase)
                        return highRanked;
                }

                /*
                 * Biword and Triword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }

            default: {
                /*
                 * Biword and Triword is not enabled so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }
            }
        }

    } catch (Exception e) {
        String msg = "Error while processing query [" + fieldQuery + "]\n";
        IdSearchLog.l.fatal(this.getClass().getName() + ":\t" + msg);
        e.printStackTrace();
        throw new IOException(msg, e);
    }
}
From source file:com.bizosys.hsearch.kv.indexing.KVMapperBase.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) public void mapFreeTextSet(Field fld, Context context) throws IOException, InterruptedException { terms.clear();/*w w w . ja v a2 s. co m*/ CharTermAttribute termAttribute = null; TokenStream stream = null; int wordhash; String wordhashStr; char firstChar; char lastChar; try { if (isFieldNull) return; Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name); stream = analyzer.tokenStream(fld.name, new StringReader(fldValue)); termAttribute = stream.getAttribute(CharTermAttribute.class); while (stream.incrementToken()) { String termWord = termAttribute.toString(); wordhash = Hashing.hash(termWord); wordhashStr = new Integer(wordhash).toString(); firstChar = wordhashStr.charAt(0); lastChar = wordhashStr.charAt(wordhashStr.length() - 1); rowKeyP1 = mergeId + "_" + firstChar + "_" + lastChar; appender.delete(0, appender.capacity()); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); appender.delete(0, appender.capacity()); rowVal = appender.append(incrementalIdSeekPosition).append(KVIndexer.FIELD_SEPARATOR) .append(wordhash).toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } } catch (Exception e) { e.printStackTrace(); } }
From source file:com.bizosys.hsearch.kv.indexing.KVMapperBase.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) public final void mapFreeTextBitset(final Field fld, final Context context) throws IOException, InterruptedException { terms.clear();//from w ww.ja va2s .co m CharTermAttribute termAttribute = null; TokenStream stream = null; try { if (isFieldNull) return; Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name); stream = analyzer.tokenStream(fld.name, new StringReader(fldValue)); termAttribute = stream.getAttribute(CharTermAttribute.class); String last2 = null; String last1 = null; while (stream.incrementToken()) { String termWord = termAttribute.toString(); if (0 == termWord.length()) continue; appender.delete(0, appender.capacity()); /** * Row Key is mergeidFIELDwordhashStr */ boolean isEmpty = (null == mergeId) ? true : (mergeId.length() == 0); String rowKeyPrefix = (isEmpty) ? fld.name : mergeId + "_" + fld.name; rowKeyP1 = rowKeyPrefix + termWord; rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); appender.setLength(0); rowVal = appender.append(incrementalIdSeekPosition).toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); if (!fld.isBiWord && !fld.isTriWord) continue; /** * Do Three phrase word */ if (null != last2) { appender.setLength(0); rowKeyP1 = appender.append(rowKeyPrefix).append(last2).append(' ').append(last1).append(' ') .append(termWord).append(' ').append('*').toString(); appender.setLength(0); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } /** * Do Two phrase word */ if (null != last1) { appender.setLength(0); rowKeyP1 = appender.append(rowKeyPrefix).append(last1).append(' ').append(termWord).append(' ') .append('*').toString(); appender.setLength(0); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } last2 = last1; last1 = termWord; } } catch (Exception e) { e.printStackTrace(); System.err.println("Error While tokenizing : " + e.getMessage()); } finally { try { if (null != stream) stream.close(); } catch (Exception ex) { IdSearchLog.l.warn("Error during Tokenizer Stream closure"); } } }
From source file:com.bizosys.unstructured.CustomAnalyzerExample.java
License:Apache License
public static void main(String[] args) throws Exception {

    Document doc = new Document();
    doc.add(new Field("description", "Abinash", Field.Store.NO, Field.Index.ANALYZED));

    Analyzer analyzer = new CustomAnalyzerExample();
    for (Fieldable field : doc.getFields()) {
        StringReader sr = new StringReader(field.stringValue());
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println(termA.toString());
        }
        sr.close();
    }
}