List of usage examples for org.apache.lucene.analysis.TokenStream#reset()
public void reset() throws IOException
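All of the examples below follow the same contract: obtain a TokenStream, call reset() before the first incrementToken(), iterate the tokens, then call end() and close(). As a minimal, self-contained sketch of that pattern (assuming a reasonably recent Lucene, roughly 4.x or newer; the class name, field name "body", and analyzer choice are placeholders, not taken from any source file listed here):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetSketch {

    // Prints each token of the input text; field name and analyzer are illustrative only.
    public static void printTokens(String text) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                     // mandatory: positions the stream at the first token
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                       // records end-of-stream state (e.g. final offset)
        }                                   // close() releases resources so the analyzer can reuse the stream
        analyzer.close();
    }
}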
From source file:lucandra.IndexWriter.java
License:Apache License
@SuppressWarnings("unchecked")
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {

    List<String> allIndexedTerms = new ArrayList<String>();

    // check for special field name
    String docId = doc.get(CassandraUtils.documentIdField);

    if (docId == null)
        docId = Long.toHexString((long) (System.nanoTime() + (Math.random() * System.nanoTime())));

    int position = 0;

    for (Fieldable field : (List<Fieldable>) doc.getFields()) {

        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            String term = CassandraUtils.createColumnName(field.name(), field.stringValue());
            allIndexedTerms.add(term);

            String key = indexName + CassandraUtils.delimeter + term;

            Map<String, List<Number>> termMap = new HashMap<String, List<Number>>();
            termMap.put(CassandraUtils.termFrequencyKey, CassandraUtils.emptyArray);
            termMap.put(CassandraUtils.positionVectorKey, CassandraUtils.emptyArray);

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                    docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, termMap);

        } else if (field.isIndexed()) {

            TokenStream tokens = field.tokenStreamValue();

            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }

            // collect term information per field
            Map<String, Map<String, List<Number>>> allTermInformation = new HashMap<String, Map<String, List<Number>>>();

            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            // Build the termPositions vector for all terms
            tokens.reset(); // reset the TokenStream to the first token

            // set up token attributes we are working on

            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens
                        .addAttribute(PositionIncrementAttribute.class);

            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);

            // store normalizations of field per term per document rather than per field.
            // this adds more to write but less to read on other side
            Integer tokensInField = new Integer(0);

            while (tokens.incrementToken()) {
                tokensInField++;
                String term = CassandraUtils.createColumnName(field.name(), termAttribute.term());

                allIndexedTerms.add(term);

                // fetch all collected information for this term
                Map<String, List<Number>> termInfo = allTermInformation.get(term);

                if (termInfo == null) {
                    termInfo = new HashMap<String, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }

                // term frequency
                {
                    List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKey);

                    if (termFrequency == null) {
                        termFrequency = new ArrayList<Number>();
                        termFrequency.add(new Integer(0));
                        termInfo.put(CassandraUtils.termFrequencyKey, termFrequency);
                    }

                    // increment
                    termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                }

                // position vector
                if (field.isStorePositionWithTermVector()) {
                    position += (posIncrAttribute.getPositionIncrement() - 1);

                    List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKey);

                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.positionVectorKey, positionVector);
                    }

                    positionVector.add(++position);
                }

                // term offsets
                if (field.isStoreOffsetWithTermVector()) {

                    List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKey);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.offsetVectorKey, offsetVector);
                    }

                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());
                }
            }

            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }

            for (Map.Entry<String, Map<String, List<Number>>> term : allTermInformation.entrySet()) {

                // Terms are stored within a unique key combination
                // This is required since cassandra loads all columns
                // in a key/column family into memory
                String key = indexName + CassandraUtils.delimeter + term.getKey();

                // Mix in the norm for this field alongside each term
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(CassandraUtils.normsKey, bnorm);
                }

                CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                        docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, term.getValue());
            }
        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {

            byte[] _value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");

            // first byte flags if binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);

            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);

            String key = indexName + CassandraUtils.delimeter + docId;

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
                    field.name().getBytes("UTF-8"), CassandraUtils.hashKey(key), value, null);
        }
    }

    // Finally, store meta-data so we can delete this document
    String key = indexName + CassandraUtils.delimeter + docId;
    CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
            CassandraUtils.documentMetaField.getBytes("UTF-8"), CassandraUtils.hashKey(key),
            CassandraUtils.toBytes(allIndexedTerms), null);

    if (autoCommit)
        CassandraUtils.robustBatchInsert(client, getMutationMap());
}
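The lucandra excerpt above is written against a pre-4.0 Lucene API (Fieldable, TermAttribute, Similarity.encodeNorm). As a hedged sketch only, its core reset/incrementToken loop translates to the newer attribute API roughly as follows, with CharTermAttribute in place of the removed TermAttribute and all Cassandra-specific bookkeeping elided; the helper class and method names are invented for illustration:

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

class TermLoopSketch {
    // Consumes an already-created TokenStream and returns how many tokens it produced,
    // mirroring the tokensInField counting above; per-term statistics collection is elided.
    static int countTokens(TokenStream tokens) throws IOException {
        CharTermAttribute termAttribute = tokens.addAttribute(CharTermAttribute.class); // replaces TermAttribute
        int tokensInField = 0;
        tokens.reset();                       // still mandatory before the first incrementToken()
        while (tokens.incrementToken()) {
            tokensInField++;
            String term = termAttribute.toString();   // was termAttribute.term()
            // ... collect frequency / position / offset information for "term" as in the loop above ...
        }
        tokens.end();
        tokens.close();
        return tokensInField;
    }
}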
From source file:lux.search.highlight.XmlHighlighter.java
License:Mozilla Public License
private void init(TokenStream tokenStream) {
    try {
        tokenStream.reset();
        scorer.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
        scorerTokens = scorer.init(tokenStream);
        if (scorerTokens == null) {
            // The scorer didn't consume any tokens (it does that for PhraseQuery),
            // in which case we must give it the live token stream
            scorer.init(xmlStreamTokens);
        }
        // we score the entire document as a single fragment
        scorer.startFragment(new TextFragment("", 0, 0));
    } catch (IOException e) {
        throw new LuxException(e);
    }
}
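The XmlHighlighter above resets the stream itself before handing it to its scorer. For contrast, with the stock Lucene highlighter the analysis loop is driven internally; a minimal sketch under that assumption (class, method, and field names here are illustrative, not taken from the Lux source):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;

class PlainHighlightSketch {
    // Returns the best-scoring fragment of "text" for "query"; field name "content" is a placeholder.
    static String bestFragment(Analyzer analyzer, Query query, String text)
            throws IOException, InvalidTokenOffsetsException {
        Highlighter highlighter = new Highlighter(new QueryScorer(query));
        TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
        // Highlighter drives the stream itself (reset/incrementToken/end/close) in recent Lucene versions.
        return highlighter.getBestFragment(ts, text);
    }
}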
From source file:mahout.classifier.Classifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
        }
        System.out.println(labels.get(bestCategoryId) + "\t" + tweet);
    }
    analyzer.close();
    reader.close();
}
From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java
License:Apache License
public static void analyzerByStop(String sentence) throws Exception {
    Analyzer analyzer = new StopAnalyzer();

    TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
    tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();

    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = (CharTermAttribute) tokenStream
                .getAttribute(CharTermAttribute.class);
        System.out.print(charTermAttribute.toString() + " ,");
    }

    analyzer.close();
}
From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java
License:Apache License
public static Set<String> analyzerByStandard(String sentence) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new HashSet<>();
        while (tokenStream.incrementToken()) {
            words.add(((CharTermAttribute) tokenStream.getAttribute(CharTermAttribute.class)).toString());
        }
        return words;
    } finally {
        analyzer.close();
    }
}
From source file:me.smoe.adar.utils.cam.o.common.SentenceAnalyzer.java
License:Apache License
public static Set<String> analyzer(String sentence) throws Exception {
    if (StringUtils.isEmpty(sentence)) {
        return Collections.emptySet();
    }

    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new LinkedHashSet<>();
        while (tokenStream.incrementToken()) {
            String word = ((CharTermAttribute) tokenStream.getAttribute(CharTermAttribute.class)).toString();
            if (word.length() <= 1) {
                continue;
            }
            words.add(word);
        }
        return words;
    } finally {
        analyzer.close();
    }
}
From source file:mvm.rya.indexing.accumulo.freetext.LuceneTokenizer.java
License:Apache License
@Override
public SortedSet<String> tokenize(String string) {
    SortedSet<String> set = new TreeSet<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            set.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
    return set;
}
From source file:net.mad.ads.server.utils.http.KeywordUtils.java
License:Open Source License
public static List<String> getTokens(String queryString) {
    try {
        GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_33);
        TokenStream ts = a.tokenStream("", new StringReader(queryString));

        List<String> tokens = new ArrayList<String>();

        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String token = termAtt.toString();
            tokens.add(token);
        }
        ts.end();
        ts.close();

        return tokens;
    } catch (IOException e) {
        logger.error("", e);
    }
    return null;
}
From source file:net.sf.logsaw.index.internal.LuceneIndexServiceImpl.java
License:Open Source License
private void fillPhraseQuery(PhraseQuery phrase, Analyzer analyzer, String fld, String val) throws IOException {
    TokenStream ts = analyzer.tokenStream(fld, new StringReader(val));
    try {
        ts.reset();
        // Iterate over tokens and treat each token as term
        int pos = 0;
        while (ts.incrementToken()) {
            CharTermAttribute t = ts.getAttribute(CharTermAttribute.class);
            PositionIncrementAttribute p = ts.getAttribute(PositionIncrementAttribute.class);
            pos += p.getPositionIncrement();
            phrase.add(new Term(fld, t.toString()), pos - 1);
        }
        // End-of-stream clean-up
        ts.end();
    } finally {
        ts.close();
    }
}
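The phrase.add(new Term(...), pos - 1) call above belongs to the mutable PhraseQuery of Lucene 4.x. On Lucene 5 and later the same token loop would feed a PhraseQuery.Builder instead; a hedged sketch of that variant, with an invented class and method name:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.PhraseQuery;

class PhraseQuerySketch {
    // Builds a PhraseQuery from the analyzed tokens of "val", preserving position increments.
    static PhraseQuery buildPhraseQuery(Analyzer analyzer, String fld, String val) throws IOException {
        PhraseQuery.Builder builder = new PhraseQuery.Builder();
        try (TokenStream ts = analyzer.tokenStream(fld, new StringReader(val))) {
            CharTermAttribute t = ts.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute p = ts.addAttribute(PositionIncrementAttribute.class);
            ts.reset();
            int pos = 0;
            while (ts.incrementToken()) {
                pos += p.getPositionIncrement();
                builder.add(new Term(fld, t.toString()), pos - 1);
            }
            ts.end();
        }
        return builder.build();
    }
}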
From source file:net.sf.okapi.lib.tmdb.lucene.Seeker.java
License:Open Source License
public List<TmHit> searchFuzzy(String genericText, String codesAsString, String tmId, String locale,
        int max, int threshold, HashMap<String, String> attributes) {
    float searchThreshold = (float) threshold;
    if (threshold < 0)
        searchThreshold = 0.0f;
    if (threshold > 100)
        searchThreshold = 100.0f;

    String queryText = genericText;

    String gtextFName = TmEntry.GTEXT_PREFIX + locale;
    Locale javaLoc = new Locale(locale);

    // create basic ngram analyzer to tokenize query
    TokenStream queryTokenStream;
    if (javaLoc.getLanguage() == Locale.ENGLISH.getLanguage()) {
        queryTokenStream = defaultFuzzyAnalyzer.tokenStream(gtextFName, new StringReader(queryText));
    } else {
        queryTokenStream = new NgramAnalyzer(javaLoc, 4).tokenStream(gtextFName, new StringReader(queryText));
    }

    // Get the TermAttribute from the TokenStream
    CharTermAttribute termAtt = (CharTermAttribute) queryTokenStream.addAttribute(CharTermAttribute.class);

    TmFuzzyQuery fQuery = new TmFuzzyQuery(searchThreshold, gtextFName);
    try {
        queryTokenStream.reset();
        while (queryTokenStream.incrementToken()) {
            //Term t = new Term(keyIndexField, new String(termAtt.buffer()));
            Term t = new Term(gtextFName, termAtt.toString());
            fQuery.add(t);
        }
        queryTokenStream.end();
        queryTokenStream.close();
    } catch (IOException e) {
        throw new OkapiIOException(e.getMessage(), e);
    }

    return getFuzzyHits(fQuery, genericText, codesAsString, tmId, locale, max, searchThreshold, attributes);
}