List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. getAttribute returns the stream's existing instance of that attribute type; if the attribute may not have been registered yet, use addAttribute(Class) instead, which creates and registers the instance on demand.
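Before the project examples below, here is a minimal self-contained sketch of the usual consume loop. It assumes a recent Lucene where StandardAnalyzer no longer takes a Version argument; the field name "body" and the sample text are illustrative only.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class GetAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("body", "Hello token streams");
        // addAttribute registers the attribute if the stream lacks it;
        // getAttribute then returns that same registered instance.
        stream.addAttribute(CharTermAttribute.class);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        stream.reset(); // mandatory before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
        }
        stream.end();   // records end-of-stream state (final offset, etc.)
        stream.close();
        analyzer.close();
    }
}

Prefer addAttribute when the attribute might not be present yet; getAttribute suits code paths, like the examples below, where the tokenizer is known to have registered the attribute already.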
From source file: com.bizosys.unstructured.IndexWriter.java
License: Apache License
/**
 * Finds each term's offset and position and stages an IndexRow per unique token.
 *
 * @param stream       token stream to consume
 * @param docId        document identifier
 * @param docType      document type code
 * @param filter       optional document metadata attached to each row
 * @param fieldType    field type code
 * @param uniqueTokens accumulator of rows keyed by generated row key
 * @throws IOException
 */
private final void tokenize(TokenStream stream, int docId, int docType, DocumentMetadata filter,
        int fieldType, Map<String, IndexRow> uniqueTokens) throws IOException {
    String token = null;
    int curoffset = 0;
    int lastoffset = 0;
    int position = -1;
    StringBuilder sb = new StringBuilder();
    CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = stream.getAttribute(OffsetAttribute.class);
    stream.reset(); // reset the stream before the first incrementToken()
    while (stream.incrementToken()) {
        token = termA.toString();
        curoffset = offsetA.endOffset();
        if (lastoffset != curoffset)
            position++;
        lastoffset = curoffset;
        String key = IndexRow.generateKey(sb, docId, token, docType, fieldType, filter);
        sb.setLength(0); // reuse the builder for the next key
        if (uniqueTokens.containsKey(key)) {
            IndexRow existingRow = uniqueTokens.get(key);
            existingRow.set(curoffset, position);
            existingRow.occurance++;
        } else {
            IndexRow row = new IndexRow(docId, token, docType, fieldType, curoffset, position);
            if (null != filter)
                row.docMeta = filter;
            uniqueTokens.put(key, row);
        }
    }
    stream.end();
    stream.close();
    for (IndexRow row : uniqueTokens.values())
        cachedIndex.add(row);
}
From source file: com.bizosys.unstructured.StopwordAndSynonymAnalyzer.java
License: Apache License
public static void main(String[] args) throws IOException {
    Document doc = new Document();
    doc.add(new Field("description", "dress/t-shirt dress for \"good boy\"", Field.Store.NO, Field.Index.ANALYZED));
    Analyzer analyzer = new StopwordAndSynonymAnalyzer();
    for (Fieldable field : doc.getFields()) {
        String query = "dress/t-shirt dress for \"good boy\"";
        StringReader sr = new StringReader(query);
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
        stream.reset(); // reset the stream before the first incrementToken()
        if (DEBUG_ENABLED) {
            while (stream.incrementToken()) {
                IdSearchLog.l.debug("Term:" + termA.toString());
            }
        }
        stream.end();
        stream.close();
        sr.close();
    }
    analyzer.close();
}
From source file: com.bizosys.unstructured.SynonumAnalyzerExample.java
License: Apache License
public static void main(String[] args) throws Exception {
    Document doc = new Document();
    doc.add(new Field("description", "bengalure is a good city", Field.Store.NO, Field.Index.ANALYZED));
    Map<String, String> syn = new HashMap<String, String>();
    syn.put("bangalore", "bengalure|bangaluru");
    Analyzer analyzer = new StopwordAndSynonymAnalyzer();
    //analyzer.load(null, syn);
    for (Fieldable field : doc.getFields()) {
        StringReader sr = new StringReader(field.stringValue());
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
        stream.reset(); // reset the stream before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println("Term:" + termA.toString());
        }
        stream.end();
        stream.close();
        sr.close();
    }
}
From source file: com.bull.aurocontrol.csst.poc.index.interval.InNumericIntervalQuery.java
License: Apache License
/**
 * Creates a query to find intervals a number is in.
 *
 * @param name The name of the field to search.
 * @param value The search value.
 * @param precisionStep The precision step used when indexing the field.
 */
public InNumericIntervalQuery(final String name, final long value, final int precisionStep) {
    super(true);
    this.value = value;
    TokenStream stream = new NumericTokenStream(precisionStep).setLongValue(value);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            this.add(new TermQuery(new Term(name, stream.getAttribute(TermAttribute.class).term())),
                    BooleanClause.Occur.SHOULD);
        }
    } catch (IOException e) {
        throw new IllegalStateException("This should never happen - NumericTokenStream does no IO.");
    }
}
From source file: com.chimpler.example.bayes.Classifier.java
License: Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];
    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }
        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];
        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from the tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        ts.end();
        ts.close();

        // create vector wordId => weight using tf-idf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // The classifier returns one score per label; the label with the
        // highest score is the one the tweet is most likely associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file: com.chriscx.stem.Stem.java
public String evaluate(BufferedReader input) {
    if (input == null) {
        return null;
    }
    CharArraySet stopWordsSet = new CharArraySet(Version.LUCENE_46, 10000, true);
    String stopWords = "a afin ai ainsi après attendu au aujourd auquel aussi "
            + "autre autres aux auxquelles auxquels avait avant avec c car ce "
            + "ceci cela celle celles celui cependant certain certaine certaines "
            + "certains ces cet cette ceux chez ci combien comme comment "
            + "concernant contre d dans de debout dedans dehors delà depuis "
            + "derrière des désormais desquelles desquels devers devra doit "
            + "donc dont du duquel durant dès elle elles en entre environ est "
            + "et etc eu eux excepté hormis hors hélas hui il ils j je jusqu "
            + "jusque l la laquelle le lequel les lesquelles lesquels leur leurs "
            + "lorsque lui là ma mais malgré me merci mes mien mienne miennes "
            + "miens moins mon moyennant même mêmes n ne ni non nos notre nous "
            + "néanmoins nôtre nôtres on ont ou outre où par parmi partant pas "
            + "passé pendant plein plus plusieurs pour pourquoi proche près "
            + "puisque qu quand que quel quelle quelles quels qui quoi quoique "
            + "revoici revoilà s sa sans sauf se selon seront ses si sien "
            + "sienne siennes siens sinon soi soit son sont sous suivant sur "
            + "ta te tes tien tienne tiennes tiens ton tous tout toute toutes "
            + "tu un une va vers voici voilà vos votre vous vu vôtre vôtres y "
            + "à été être";
    String[] stopWordsTab = stopWords.split(" ");
    for (String word : stopWordsTab) {
        stopWordsSet.add(word);
    }
    Analyzer analyzer = new FrenchAnalyzer(Version.LUCENE_46, stopWordsSet);
    result = "";
    try {
        String line = input.readLine();
        while (line != null) {
            // scrub e-mail addresses, phone numbers/digits, separators and punctuation
            line = line.replaceAll("(\\S)+@(\\S)+.(\\S)+", "");
            line = line.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
            line = line.replaceAll("(_|-)+", "");
            line = line.replaceAll("(\\n|\\r|\\t)+", "");
            line = line.replaceAll("(?![\\._])\\p{P}", "");
            TokenStream stream = analyzer.tokenStream(null, line);
            stream.reset();
            while (stream.incrementToken()) {
                String wordset = stream.getAttribute(CharTermAttribute.class).toString();
                wordset = wordset.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
                result += wordset + " ";
            }
            result += "\n";
            stream.end();
            stream.close();
            line = input.readLine();
        }
        input.close();
        return result;
    } catch (IOException e) {
        // wrap the checked exception from the reader
        throw new RuntimeException(e);
    }
}
From source file: com.cloudera.knittingboar.records.Test20NewsgroupsBookParsing.java
License: Apache License
/**
 * Counts words.
 *
 * @param analyzer the analyzer used to tokenize the input
 * @param words    the collection the tokens are added to
 * @param in       the input to tokenize
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    System.out.println("> ----- countWords ------");
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // reset the stream before the first incrementToken()
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        System.out.print(" " + s);
        words.add(s);
    }
    System.out.println("\n<");
    /*overallCounts.addAll(words);*/
}
From source file: com.cloudera.knittingboar.records.TwentyNewsgroupsRecordFactory.java
License: Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // reset the stream before the first incrementToken()
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
}
From source file: com.cloudera.knittingboar.sgd.olr.TestBaseOLR_Train20Newsgroups.java
License: Apache License
/**
 * Counts words.
 *
 * @param analyzer the analyzer used to tokenize the input
 * @param words    the collection the tokens are added to
 * @param in       the input to tokenize
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    //System.out.println( "> ----- countWords ------" );
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // reset the stream before the first incrementToken()
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        //System.out.print( " " + s );
        words.add(s);
    }
    //System.out.println( "\n<" );
    /*overallCounts.addAll(words);*/
}
From source file: com.cloudera.knittingboar.utils.DatasetConverter.java
License: Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // reset the stream before the first incrementToken()
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        // System.out.print( " " + s );
        words.add(s);
    }
}