Usage examples for org.apache.lucene.analysis.TokenStream#getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value; the method returns the stream's registered instance of that attribute type.
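Before the collected examples, here is a minimal, self-contained sketch of the usual consumption pattern. It assumes a recent Lucene release (no-argument StandardAnalyzer constructor and Analyzer.tokenStream(String, String)); the class name, field name, and sample text are arbitrary: register the attribute with addAttribute, reset the stream, then read the attribute while iterating with incrementToken.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "quick brown fox")) {
            // Register the attribute before consuming the stream.
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // getAttribute returns the same per-stream instance that
                // addAttribute registered above.
                System.out.println(ts.getAttribute(CharTermAttribute.class) + " == " + term);
            }
            ts.end();
        }
        analyzer.close();
    }
}

Because getAttribute returns the same per-stream instance on every call, several of the examples below compare attribute instances with ==.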
From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizerTest.java
License:Apache License
private void assertTokenStream(final TokenStream stream, final String expectedStream) throws Exception {
    final String[] expectedTokens = expectedStream.split("/");
    int count = 0;
    for (final String expectedToken : expectedTokens) {
        final String[] attrs = expectedToken.split(",");
        assertTrue(stream.incrementToken());
        final String term = attrs[0];
        assertAttribute(count, "term", term, stream.getAttribute(CharTermAttribute.class).toString());
        if (attrs.length > 1) {
            final int so = Integer.parseInt(attrs[1]);
            assertAttribute(count, "startOffset", so, stream.getAttribute(OffsetAttribute.class).startOffset());
            if (attrs.length > 2) {
                final int eo = Integer.parseInt(attrs[2]);
                assertAttribute(count, "endOffset", eo, stream.getAttribute(OffsetAttribute.class).endOffset());
                if (attrs.length > 3) {
                    final int pi = Integer.parseInt(attrs[3]);
                    assertAttribute(count, "posInc", pi,
                            stream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
                }
            }
        }
        count++;
    }
    assertFalse(stream.incrementToken());
}
From source file:lia.recent.CopyOfSearch.java
License:Apache License
/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
    System.out.println("Hello");
    KoreanAnalyzer ka = new KoreanAnalyzer();
    TokenStream ts = ka.tokenStream("", new java.io.StringReader("? ? I an Hello"));
    System.out.println(ts.toString());
    try {
        while (ts.incrementToken()) {
            org.apache.lucene.analysis.tokenattributes.TermAttribute ta = ts
                    .getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
            System.out.println("adf" + ta.term());
        }
    } catch (Exception e) {
        System.out.println(e.toString());
    }
}
From source file:lux.index.analysis.QNameTokenFilter.java
License:Mozilla Public License
public final void reset(TokenStream inputAgain) {
    assert (input.getAttribute(CharTermAttribute.class) == inputAgain.getAttribute(CharTermAttribute.class));
}
From source file:lux.index.analysis.XmlTokenStreamBase.java
License:Mozilla Public License
public void reset(Reader reader) throws IOException {
    close();
    TokenStream reset = analyzer.tokenStream(fieldName, reader);
    // This must be the same token stream: ie the Analyzer must be re-usable, and the
    // original token stream must have arisen from it. We don't check for actual
    // identity with wrapped since that might get wrapped again (eg w/QNameTokenFilter).
    assert (reset.getAttribute(CharTermAttribute.class) == wrapped.getAttribute(CharTermAttribute.class));
}
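The assertions in the two reset methods above rely on Lucene's analyzer-reuse contract: a reusable Analyzer hands back the same per-thread TokenStreamComponents on each tokenStream() call, so the attribute instances survive across calls. A minimal sketch of that behavior follows, assuming the default GLOBAL_REUSE_STRATEGY and a recent Lucene; the class name, field name, and inputs are arbitrary.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ReuseDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        CharTermAttribute first;
        try (TokenStream ts = analyzer.tokenStream("f", "one two")) {
            first = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) { /* drain the stream */ }
            ts.end();
        }
        try (TokenStream ts = analyzer.tokenStream("f", "three four")) {
            CharTermAttribute second = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) { /* drain the stream */ }
            ts.end();
            // Default reuse strategy: the same per-thread components are
            // recycled, so the attribute instance is identical.
            System.out.println(first == second); // expected: true
        }
        analyzer.close();
    }
}

This identity check is the same trick the asserts in QNameTokenFilter and XmlTokenStreamBase use to verify that a stream really came from the expected analyzer.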
From source file:mahout.classifier.Classifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }
        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is most likely
        // to be associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
        }
        System.out.println(labels.get(bestCategoryId) + "\t" + tweet);
    }
    analyzer.close();
    reader.close();
}
From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java
License:Apache License
public static void analyzerByStop(String sentence) throws Exception {
    Analyzer analyzer = new StopAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
    tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = (CharTermAttribute) tokenStream
                .getAttribute(CharTermAttribute.class);
        System.out.print(charTermAttribute.toString() + " ,");
    }
    analyzer.close();
}
From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java
License:Apache License
public static Set<String> analyzerByStandard(String sentence) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        Set<String> words = new HashSet<>();
        while (tokenStream.incrementToken()) {
            words.add(((CharTermAttribute) tokenStream.getAttribute(CharTermAttribute.class)).toString());
        }
        return words;
    } finally {
        analyzer.close();
    }
}
From source file:me.smoe.adar.utils.cam.o.common.SentenceAnalyzer.java
License:Apache License
public static Set<String> analyzer(String sentence) throws Exception {
    if (StringUtils.isEmpty(sentence)) {
        return Collections.emptySet();
    }
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        Set<String> words = new LinkedHashSet<>();
        while (tokenStream.incrementToken()) {
            String word = ((CharTermAttribute) tokenStream.getAttribute(CharTermAttribute.class)).toString();
            if (word.length() <= 1) {
                continue;
            }
            words.add(word);
        }
        return words;
    } finally {
        analyzer.close();
    }
}
From source file:mvm.rya.indexing.accumulo.freetext.LuceneTokenizer.java
License:Apache License
@Override
public SortedSet<String> tokenize(String string) {
    SortedSet<String> set = new TreeSet<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            set.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
    return set;
}
From source file:net.mad.ads.server.utils.http.KeywordUtils.java
License:Open Source License
public static List<String> getTokens(String queryString) {
    try {
        GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_33);
        TokenStream ts = a.tokenStream("", new StringReader(queryString));
        List<String> tokens = new ArrayList<String>();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String token = termAtt.toString();
            tokens.add(token);
        }
        ts.end();
        ts.close();
        return tokens;
    } catch (IOException e) {
        logger.error("", e);
    }
    return null;
}