List of usage examples for org.apache.lucene.analysis.TokenStream.incrementToken()
public abstract boolean incrementToken() throws IOException;
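Before the project-specific examples below, here is a minimal, self-contained sketch of the consumption loop that incrementToken() participates in. It is not taken from the source files listed below; the field name, analyzer, and sample text are placeholders, and it assumes a recent Lucene release where StandardAnalyzer has a no-argument constructor and TokenStream is Closeable. Note the reset()/end()/close() calls, which the TokenStream contract requires even though several of the older snippets below omit them.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();                   // any analyzer works here
        try (TokenStream stream = analyzer.tokenStream("body", "a quick brown fox")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                                           // mandatory before the first incrementToken()
            while (stream.incrementToken()) {                         // returns false when no more tokens are available
                System.out.println(term.toString());
            }
            stream.end();                                             // records end-of-stream state (e.g. final offset)
        }
        analyzer.close();
    }
}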
From source file:com.bizosys.hsearch.kv.impl.StorageReader.java
License:Apache License
/**
 * Returns the ids for analyzed field that is not repeatable.
 * @param checkForAllWords
 * @param fieldName
 * @return
 * @throws IOException
 */
private final BitSetOrSet readStorageTextIdsSet(final boolean checkForAllWords, final String fieldName,
        String fieldQuery) throws IOException {

    StringBuilder sb = new StringBuilder();
    String docType = "*";
    String fieldType = fieldName;
    String wordHash = null;
    int hash = 0;

    BitSetOrSet destination = new BitSetOrSet();
    boolean isVirgin = true;
    String currentRowId = null;
    String mergeid = rowId.substring(0, rowId.lastIndexOf('_'));

    int fieldTypeLoc = fieldName.indexOf('/');
    if (fieldTypeLoc > 0) {
        docType = fieldName.substring(0, fieldTypeLoc);
        fieldType = fieldName.substring(fieldTypeLoc + 1);
    }

    byte[] dataChunk = null;

    try {
        Map<String, Integer> dTypes = new HashMap<String, Integer>(1);
        dTypes.put(docType, 1);
        setDocumentTypeCodes(dTypes);

        Map<String, Integer> fTypes = new HashMap<String, Integer>(1);
        fTypes.put(fieldType, 1);
        setFieldTypeCodes(fTypes);

        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fieldName);
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(fieldQuery));
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        Set<String> terms = new LinkedHashSet<String>();
        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }

        String docTypeCode = "*".equals(docType) ? "*"
                : new Integer(DocumentTypeCodes.getInstance().getCode(docType)).toString();
        String fldTypeCode = "*".equals(fieldType) ? "*"
                : new Integer(FieldTypeCodes.getInstance().getCode(fieldType)).toString();

        for (String term : terms) {

            if (DEBUG_ENABLED) {
                IdSearchLog.l.debug("Finding Term :" + term);
            }

            hash = Hashing.hash(term);
            wordHash = new Integer(hash).toString();
            sb.delete(0, sb.length());
            fieldQuery = sb.append(docTypeCode).append('|').append(fldTypeCode).append('|').append('*')
                    .append('|').append(hash).append('|').append("*|*").toString();
            sb.delete(0, sb.length());
            currentRowId = mergeid + "_" + wordHash.charAt(0) + "_" + wordHash.charAt(wordHash.length() - 1);

            ComputeKV compute = new ComputeKV();
            compute.kvType = (instruction.getOutputType() == Datatype.FREQUENCY_INDEX) ? Datatype.STRING
                    : instruction.getOutputType();
            compute.kvRepeatation = instruction.getProcessingHint().startsWith("true");
            compute.isCompressed = instruction.getProcessingHint().endsWith("true");

            byte[] data = KvRowReaderFactory.getInstance().getReader(this.isCachable).readStoredProcedureBlob(
                    tableName, currentRowId.getBytes(), compute, null, null, filterQuery, instruction);

            Collection<byte[]> dataL = SortedBytesArray.getInstanceArr().parse(data).values();
            int size = (null == dataL) ? 0 : dataL.size();

            if (checkForAllWords) {
                if (size > 0) {
                    dataChunk = dataL.isEmpty() ? null : dataL.iterator().next();
                    if (dataChunk == null) {
                        destination.clear();
                        break;
                    }
                } else {
                    destination.clear();
                    break;
                }

                BitSetWrapper bitSets = SortedBytesBitset.getInstanceBitset().bytesToBitSet(dataChunk, 0,
                        dataChunk.length);

                if (isVirgin) {
                    destination.setDocumentSequences(bitSets);
                    isVirgin = false;
                    continue;
                }

                BitSetOrSet source = new BitSetOrSet();
                source.setDocumentSequences(bitSets);
                destination.and(source);

            } else {
                if (size == 0) continue;
                dataChunk = dataL.isEmpty() ? null : dataL.iterator().next();
                if (dataChunk == null) continue;

                BitSetWrapper bitSets = SortedBytesBitset.getInstanceBitset().bytesToBitSet(dataChunk, 0,
                        dataChunk.length);

                if (isVirgin) {
                    destination.setDocumentSequences(bitSets);
                    isVirgin = false;
                    continue;
                } else {
                    BitSetOrSet source = new BitSetOrSet();
                    source.setDocumentSequences(bitSets);
                    destination.or(source);
                }
            }
        }

        return destination;

    } catch (Exception e) {
        String msg = "Error while processing query [" + fieldQuery + "]\n";
        msg = msg + "Found Data Chunk\t" + ((null == dataChunk) ? "None" : new String(dataChunk));
        IdSearchLog.l.fatal(this.getClass().getName() + ":\t" + msg);
        e.printStackTrace();
        throw new IOException(msg, e);
    }
}
From source file:com.bizosys.hsearch.kv.impl.StorageReader.java
License:Apache License
/**
 * Returns the ids for analyzed field that is repeatable.
 * @param checkForAllWords
 * @param biWord
 * @param triWord
 * @param isCompressed
 * @param isCached
 * @param fieldName
 * @param enableNGram
 * @return
 * @throws IOException
 */
private final BitSetOrSet readStorageTextIdsBitset(final boolean checkForAllWords, final String fieldQuery,
        final boolean biWord, final boolean triWord, boolean isCompressed, boolean isCached,
        final String fieldName, boolean enableNGram, boolean checkExactPhrase) throws IOException {

    BitSetOrSet destination = new BitSetOrSet();
    String rowIdPrefix = rowId;

    try {
        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fieldName);
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(fieldQuery));
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        Set<String> terms = new LinkedHashSet<String>();
        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }
        int termsT = terms.size();

        if (enableNGram) {

            if (DEBUG_ENABLED) IdSearchLog.l.debug("NGRam Explosion");

            int subsequenceLen = 1;
            if (biWord) subsequenceLen = 2;
            else if (triWord) subsequenceLen = 3;

            /**
             * There may be a penalty on performance.
             * Don't allow total search phrases > 10
             */
            if (triWord && (termsT > 4)) subsequenceLen = 2;
            if ((subsequenceLen == 2) && (termsT > 5)) subsequenceLen = 1;

            /**
             * "red party gown"
             * "party gown dress"
             * "red party"
             * "party gown"
             * "gown dress"
             * "red"
             * "party"
             * "gown"
             * "dress"
             */
            List<String> phrases = new ArrayList<String>();
            StringBuilder sb = new StringBuilder(1024);
            String[] termsA = new String[terms.size()];
            terms.toArray(termsA);

            for (int subSequence = subsequenceLen; subSequence > 0; subSequence--) {
                if (subSequence <= 0) break;
                for (int wordPosition = 0; wordPosition <= termsT - subSequence; wordPosition++) {
                    for (int pos = 0; pos < subSequence; pos++) {
                        if (pos > 0) sb.append(' ');
                        sb.append(termsA[wordPosition + pos]);
                    }
                    phrases.add(sb.toString());
                    sb.setLength(0);
                }
            }

            for (String phrase : phrases) {
                BitSetOrSet phraseMatches = new BitSetOrSet();
                findATerm(checkForAllWords, isCompressed, isCached, phraseMatches, rowIdPrefix, phrase, false);
                destination.orQueryWithFoundIds.put(phrase, phraseMatches);
                destination.or(phraseMatches);
            }

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("NGram Query OR trace > " + destination.orQueryWithFoundIds.toString());

            return destination;

        } else {

            if (DEBUG_ENABLED) IdSearchLog.l.debug("Normal Query processing");

            //check for all words
            BitSetOrSet highRanked = null;

            switch (termsT) {

            case 2: {
                /**
                 * All 2 words are consecutive
                 */
                if (biWord) {
                    Iterator<String> itr = terms.iterator();
                    String phrase = itr.next() + " " + itr.next();
                    findATerm(checkForAllWords, isCompressed, isCached, destination, rowIdPrefix, phrase, true);
                    BitSetWrapper result = destination.getDocumentSequences();
                    int resultT = (null == result) ? 0 : result.cardinality();
                    if (resultT > 0 || checkExactPhrase) return destination;
                }

                /*
                 * Biword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination, rowIdPrefix);
            }

            case 3: {
                /**
                 * All 3 words are consecutive
                 */
                Iterator<String> itr = terms.iterator();
                String word1 = itr.next();
                String word2 = itr.next();
                String word3 = itr.next();

                if (triWord) {
                    String phrase = word1 + " " + word2 + " " + word3;
                    findATerm(checkForAllWords, isCompressed, isCached, destination, rowIdPrefix, phrase, true);
                    BitSetWrapper result = destination.getDocumentSequences();
                    int resultT = (null == result) ? 0 : result.cardinality();
                    if (resultT > 0 || checkExactPhrase) return destination;
                }

                /**
                 * If Check for all words is true minimum required result is 1 for three words
                 * else minimum required result is 0
                 */
                int requiredMinResult = checkForAllWords ? 1 : 0;

                /**
                 * 2 words are consecutive, take them and apply findAll on them
                 */
                if (biWord) {
                    String biword1 = word1 + " " + word2;
                    String biword2 = word2 + " " + word3;
                    String biword3 = word1 + " " + word3;
                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { biword1, biword2, biword3 };
                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase, false);
                        if (result > 0) found++;
                    }
                    if (found > requiredMinResult || checkExactPhrase) return highRanked;
                }

                /*
                 * Biword and Triword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination, rowIdPrefix);
            }

            case 4: {
                Iterator<String> itr = terms.iterator();
                String word1 = itr.next();
                String word2 = itr.next();
                String word3 = itr.next();
                String word4 = itr.next();

                int requiredMinResult = 0;

                if (triWord) {
                    requiredMinResult = checkForAllWords ? 1 : 0;
                    String triword1 = word1 + " " + word2 + " " + word3;
                    String triword2 = word1 + " " + word3 + " " + word4;
                    String triword3 = word2 + " " + word3 + " " + word4;
                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { triword1, triword2, triword3 };
                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase, false);
                        if (result > 0) found++;
                    }
                    if (found > requiredMinResult || checkExactPhrase) return highRanked;
                }

                if (biWord) {
                    requiredMinResult = checkForAllWords ? 2 : 0;
                    String biword1 = word1 + " " + word2;
                    String biword2 = word1 + " " + word3;
                    String biword3 = word1 + " " + word4;
                    String biword4 = word2 + " " + word3;
                    String biword5 = word2 + " " + word4;
                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { biword1, biword2, biword3, biword4, biword5 };
                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase, false);
                        if (result > 0) found++;
                    }
                    if (found > requiredMinResult || checkExactPhrase) return highRanked;
                }

                /*
                 * Biword and Triword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination, rowIdPrefix);
            }

            default: {
                /*
                 * Biword and Triword is not enabled so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination, rowIdPrefix);
            }
            }
        }

    } catch (Exception e) {
        String msg = "Error while processing query [" + fieldQuery + "]\n";
        IdSearchLog.l.fatal(this.getClass().getName() + ":\t" + msg);
        e.printStackTrace();
        throw new IOException(msg, e);
    }
}
From source file:com.bizosys.hsearch.kv.indexing.KVMapperBase.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) public void mapFreeTextSet(Field fld, Context context) throws IOException, InterruptedException { terms.clear();/* w w w. j a va 2 s . c om*/ CharTermAttribute termAttribute = null; TokenStream stream = null; int wordhash; String wordhashStr; char firstChar; char lastChar; try { if (isFieldNull) return; Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name); stream = analyzer.tokenStream(fld.name, new StringReader(fldValue)); termAttribute = stream.getAttribute(CharTermAttribute.class); while (stream.incrementToken()) { String termWord = termAttribute.toString(); wordhash = Hashing.hash(termWord); wordhashStr = new Integer(wordhash).toString(); firstChar = wordhashStr.charAt(0); lastChar = wordhashStr.charAt(wordhashStr.length() - 1); rowKeyP1 = mergeId + "_" + firstChar + "_" + lastChar; appender.delete(0, appender.capacity()); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); appender.delete(0, appender.capacity()); rowVal = appender.append(incrementalIdSeekPosition).append(KVIndexer.FIELD_SEPARATOR) .append(wordhash).toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } } catch (Exception e) { e.printStackTrace(); } }
From source file:com.bizosys.hsearch.kv.indexing.KVMapperBase.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) public final void mapFreeTextBitset(final Field fld, final Context context) throws IOException, InterruptedException { terms.clear();/*from ww w. j av a2 s . com*/ CharTermAttribute termAttribute = null; TokenStream stream = null; try { if (isFieldNull) return; Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name); stream = analyzer.tokenStream(fld.name, new StringReader(fldValue)); termAttribute = stream.getAttribute(CharTermAttribute.class); String last2 = null; String last1 = null; while (stream.incrementToken()) { String termWord = termAttribute.toString(); if (0 == termWord.length()) continue; appender.delete(0, appender.capacity()); /** * Row Key is mergeidFIELDwordhashStr */ boolean isEmpty = (null == mergeId) ? true : (mergeId.length() == 0); String rowKeyPrefix = (isEmpty) ? fld.name : mergeId + "_" + fld.name; rowKeyP1 = rowKeyPrefix + termWord; rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); appender.setLength(0); rowVal = appender.append(incrementalIdSeekPosition).toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); if (!fld.isBiWord && !fld.isTriWord) continue; /** * Do Three phrase word */ if (null != last2) { appender.setLength(0); rowKeyP1 = appender.append(rowKeyPrefix).append(last2).append(' ').append(last1).append(' ') .append(termWord).append(' ').append('*').toString(); appender.setLength(0); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } /** * Do Two phrase word */ if (null != last1) { appender.setLength(0); rowKeyP1 = appender.append(rowKeyPrefix).append(last1).append(' ').append(termWord).append(' ') .append('*').toString(); appender.setLength(0); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } last2 = last1; last1 = termWord; } } catch (Exception e) { e.printStackTrace(); System.err.println("Error While tokenizing : " + e.getMessage()); } finally { try { if (null != stream) stream.close(); } catch (Exception ex) { IdSearchLog.l.warn("Error during Tokenizer Stream closure"); } } }
From source file:com.bizosys.unstructured.CustomAnalyzerExample.java
License:Apache License
public static void main(String[] args) throws Exception {

    Document doc = new Document();
    doc.add(new Field("description", "Abinash", Field.Store.NO, Field.Index.ANALYZED));

    Analyzer analyzer = new CustomAnalyzerExample();

    for (Fieldable field : doc.getFields()) {
        StringReader sr = new StringReader(field.stringValue());
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println(termA.toString());
        }
        sr.close();
    }
}
From source file:com.bizosys.unstructured.IndexWriter.java
License:Apache License
/**
 * Find the last offset.
 * Find each term offset
 *
 * @param stream
 * @param docId
 * @param docType
 * @param fieldType
 * @param fieldBoost
 * @param codecs
 * @param uniqueTokens
 * @throws IOException
 */
private final void tokenize(TokenStream stream, int docId, int docType, DocumentMetadata filter, int fieldType,
        Map<String, IndexRow> uniqueTokens) throws IOException {

    String token = null;
    int curoffset = 0;
    int lastoffset = 0;
    int position = -1;

    StringBuilder sb = new StringBuilder();
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);

    while (stream.incrementToken()) {
        token = termA.toString();
        curoffset = offsetA.endOffset();

        if (lastoffset != curoffset) position++;
        lastoffset = curoffset;

        String key = IndexRow.generateKey(sb, docId, token, docType, fieldType, filter);
        sb.delete(0, sb.capacity());

        if (uniqueTokens.containsKey(key)) {
            IndexRow existingRow = uniqueTokens.get(key);
            existingRow.set(curoffset, position);
            existingRow.occurance++;
        } else {
            IndexRow row = new IndexRow(docId, token, docType, fieldType, curoffset, position);
            if (null != filter) row.docMeta = filter;
            uniqueTokens.put(key, row);
        }
    }

    stream.end();
    stream.close();

    for (IndexRow row : uniqueTokens.values())
        cachedIndex.add(row);
}
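As a side note to the tokenize(...) method above, which infers term positions by comparing end offsets: a more common way to track positions while iterating with incrementToken() is PositionIncrementAttribute. The sketch below is not part of the original IndexWriter class; the analyzer, field name, and sample text are placeholders, and it assumes a recent Lucene release where StandardAnalyzer has a no-argument constructor.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class PositionTrackingSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("body", new StringReader("to be or not to be"));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += posIncr.getPositionIncrement();   // filters that remove tokens may leave gaps > 1
            System.out.println(term.toString() + " pos=" + position
                    + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
        }
        stream.end();
        stream.close();
        analyzer.close();
    }
}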
From source file:com.bizosys.unstructured.StopwordAndSynonymAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException {

    Document doc = new Document();
    doc.add(new Field("description", "dress/t-shirt dress for \"good boy\"", Field.Store.NO,
            Field.Index.ANALYZED));

    Analyzer analyzer = new StopwordAndSynonymAnalyzer();

    for (Fieldable field : doc.getFields()) {
        String query = "dress/t-shirt dress for \"good boy\"";
        StringReader sr = new StringReader(query);
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);

        if (DEBUG_ENABLED) {
            while (stream.incrementToken()) {
                IdSearchLog.l.debug("Term:" + termA.toString());
            }
        }
        sr.close();
    }
    analyzer.close();
}
From source file:com.bizosys.unstructured.SynonumAnalyzerExample.java
License:Apache License
public static void main(String[] args) throws Exception {

    Document doc = new Document();
    doc.add(new Field("description", "bengalure is a good city", Field.Store.NO, Field.Index.ANALYZED));

    Map<String, String> syn = new HashMap<String, String>();
    syn.put("bangalore", "bengalure|bangaluru");

    Analyzer analyzer = new StopwordAndSynonymAnalyzer();
    //analyzer.load(null, syn);

    for (Fieldable field : doc.getFields()) {
        StringReader sr = new StringReader(field.stringValue());
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println("Term:" + termA.toString());
        }
        sr.close();
    }
}
From source file:com.bull.aurocontrol.csst.poc.index.interval.InNumericIntervalQuery.java
License:Apache License
/**
 * Creates a query to find intervals a number is in.
 * @param name The name of the field to search.
 * @param value The search value.
 * @param precisionStep The precision step used when indexing the field.
 */
public InNumericIntervalQuery(final String name, final long value, final int precisionStep) {
    super(true);
    this.value = value;

    TokenStream stream = new NumericTokenStream(precisionStep).setLongValue(value);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            this.add(new TermQuery(new Term(name, stream.getAttribute(TermAttribute.class).term())),
                    BooleanClause.Occur.SHOULD);
        }
    } catch (IOException e) {
        throw new IllegalStateException("This should never happen - NumericTokenStream does no IO.");
    }
}
From source file:com.chimpler.example.bayes.Classifier.java
License:Apache License
public static void main(String[] args) throws Exception {

    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}