List of usage examples for org.apache.lucene.analysis.TokenStream#addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
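Before the project examples below, here is a minimal, self-contained sketch of the usual call pattern. It assumes Lucene 4.3 and StandardAnalyzer (matching one of the examples further down); the field name "text" and the sample sentence are arbitrary placeholders, not taken from any of the source files.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class AddAttributeExample {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        TokenStream stream = analyzer.tokenStream("text", new StringReader("a quick example sentence"));

        // addAttribute returns the stream's CharTermAttribute instance, creating it if it
        // does not exist yet; repeated calls with the same class return the same instance
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

        stream.reset();                    // required before the first incrementToken()
        while (stream.incrementToken()) {  // termAtt is updated in place for each token
            System.out.println(termAtt.toString());
        }
        stream.end();
        stream.close();
        analyzer.close();
    }
}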
From source file: ca.ualberta.entitylinking.common.indexing.TFIDF3x.java
License: Open Source License
/**
 * This function assumes that the TFIDF vector of the document containing text is already
 * given. We simply build a tfidf-vector of the text out of the docVector.
 * The purpose of doing this is to save the time of computing the tf-idf value for words in
 * the same document.
 *
 * @param text
 * @param docVector
 * @return
 */
public Map<String, Float> TextTFIDFVector(String text, Map<String, Float> docVector) {
    Map<String, Float> map = new HashMap<String, Float>();

    // preprocess the text using StandardAnalyzer (StandardAnalyzer2 + StopAnalyzer).
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
    TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            if (docVector.containsKey(term))
                map.put(term, docVector.get(term));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    analyzer.close();
    return map;
}
From source file: cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
License: Open Source License
public boolean contain(String label) {
    try {
        IndexReader reader = IndexReader.open(this.indexDir, true);
        Searcher searcher = new IndexSearcher(reader);

        // use the boolean query
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            queryTermSet.add(termAtt.term());
        }
        stream.end();
        stream.close();

        // construct the query
        BooleanQuery bq = new BooleanQuery();
        Iterator<String> it = queryTermSet.iterator();
        while (it.hasNext()) {
            String s = it.next();
            Term term = new Term(LabelDocument.FIELD_LABEL, s);
            TermQuery termQuery = new TermQuery(term);
            bq.add(termQuery, Occur.MUST);
        }

        ExactLabelQueryResultCollector collector = new ExactLabelQueryResultCollector(reader, label);
        searcher.search(bq, collector);
        boolean ret = collector.isExistQueryLabel();
        reader.close();
        return ret;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return false;
}
From source file: cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
License: Open Source License
public TreeSet<SimilarLabelQueryResult> getSimilarLabels(String query, float similarity) {
    TreeSet<SimilarLabelQueryResult> ret = new TreeSet<SimilarLabelQueryResult>();
    if (query == null) {
        ret.add(new SimilarLabelQueryResult(null, 1));
        return ret;
    }

    try {
        IndexReader reader = IndexReader.open(this.indexDir, true);
        Searcher searcher = new IndexSearcher(reader);

        // get terms from query
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(query));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            queryTermSet.add(termAtt.term());
        }
        stream.end();
        stream.close();

        // construct the query
        BooleanQuery bq = new BooleanQuery();
        Iterator<String> it = queryTermSet.iterator();
        SynonymMap synMap = SynonymIndex.getSynonymMap();
        HashSet<String> expandedQueryTermSet = new HashSet<String>(queryTermSet);

        while (it.hasNext()) {
            String s = it.next();
            Term term = new Term(LabelDocument.FIELD_LABEL, s);
            TermQuery termQuery = new TermQuery(term);
            bq.add(termQuery, Occur.SHOULD);

            // expand using synonyms
            for (String syn : synMap.getSynonyms(s)) {
                stemer.setCurrent(syn);
                stemer.stem();
                syn = stemer.getCurrent();
                if (expandedQueryTermSet.add(syn)) {
                    term = new Term(LabelDocument.FIELD_LABEL, syn);
                    termQuery = new TermQuery(term);
                    bq.add(termQuery, Occur.SHOULD);
                }
            }
        }

        // search in the label index
        SimilarLabelQueryResultCollector collector = new SimilarLabelQueryResultCollector(reader, queryTermSet,
                similarity);
        searcher.search(bq, collector);
        ret = collector.getQueryResult();
        searcher.close();
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
From source file: cn.edu.thss.iise.beehivez.server.index.luceneindex.analyzer.SemicolonAnalyzer.java
License: Open Source License
/**
 * @param args
 */
public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of , the new TokenStream API";

    SemicolonAnalyzer analyzer = new SemicolonAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the TermAttribute from the TokenStream
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println(termAtt.term());
    }
    stream.end();
    stream.close();
}
From source file: cn.edu.thss.iise.beehivez.server.util.StringSimilarityUtil.java
License: Open Source License
/**
 * Tokenize the given string: all words are extracted and lowercased, stop
 * words are removed, and each word is replaced with its stem.
 *
 * @param label
 * @return
 */
public static HashSet<String> snowballTokenize(String label) {
    HashSet<String> ret = new HashSet<String>();
    try {
        Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
                StandardAnalyzer.STOP_WORDS_SET);
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(termAtt.term());
        }
        stream.end();
        stream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
From source file: com.aliasi.lingmed.medline.SearchableMedlineCodec.java
License: LingPipe license
public static void main(String[] args) throws Exception {
    org.apache.lucene.store.RAMDirectory directory = new org.apache.lucene.store.RAMDirectory();

    // org.apache.lucene.analysis.SimpleAnalyzer analyzer
    //     = new org.apache.lucene.analysis.SimpleAnalyzer();
    // org.apache.lucene.analysis.KeywordAnalyzer analyzer
    //     = new org.apache.lucene.analysis.KeywordAnalyzer();
    MedlineCodec codec = new MedlineCodec();
    Analyzer analyzer = codec.getAnalyzer();

    org.apache.lucene.index.IndexWriterConfig iwConf = new org.apache.lucene.index.IndexWriterConfig(
            org.apache.lucene.util.Version.LUCENE_36, analyzer);
    iwConf.setOpenMode(org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    org.apache.lucene.index.IndexWriter indexWriter = new org.apache.lucene.index.IndexWriter(directory, iwConf);

    Document doc = new Document();
    doc.add(new Field(Fields.MESH_MINOR_FIELD, "abc", Field.Store.NO, Field.Index.ANALYZED));
    doc.add(new Field(Fields.MESH_MINOR_FIELD, " xyz efg", Field.Store.NO, Field.Index.ANALYZED));
    indexWriter.addDocument(doc);
    indexWriter.close();

    org.apache.lucene.index.IndexReader reader = org.apache.lucene.index.IndexReader.open(directory);
    org.apache.lucene.search.IndexSearcher searcher = new org.apache.lucene.search.IndexSearcher(reader);
    org.apache.lucene.queryParser.QueryParser qp = new org.apache.lucene.queryParser.QueryParser(
            org.apache.lucene.util.Version.LUCENE_36, "foo", analyzer);
    org.apache.lucene.search.Query query = qp.parse(Fields.MESH_MINOR_FIELD + ":efg");
    org.apache.lucene.search.TopDocs hits = searcher.search(query, 1000);
    System.out.println("hits.length()=" + hits.scoreDocs.length);

    org.apache.lucene.analysis.TokenStream ts = analyzer.tokenStream(Fields.MESH_MINOR_FIELD,
            new java.io.StringReader("abc xyz efg"));
    org.apache.lucene.analysis.tokenattributes.CharTermAttribute terms = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    org.apache.lucene.analysis.tokenattributes.OffsetAttribute offsets = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute positions = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);

    ts.reset(); // the TokenStream contract requires reset() before incrementToken()
    while (ts.incrementToken()) {
        int increment = positions.getPositionIncrement();
        int start = offsets.startOffset();
        int end = offsets.endOffset();
        String term = terms.toString();
        System.out.println("token=|" + term + "|" + " startOffset=" + start + " endOffset=" + end
                + " positionIncr=" + increment);
    }
}
From source file: com.billiger.solr.handler.component.QLTBComponent.java
License: Apache License
/**
 * Get analyzed version of the query string.
 *
 * This uses the analyzer for the configured FieldType for this
 * component to analyze and re-assemble the original query string.
 * If no queryFieldType is configured, the original query will be
 * returned.
 *
 * This is used both in the prepare() stage of the component and
 * when reading the QLTB map data.
 */
String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(query));
    tokens.reset();
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    tokens.close();
    return norm.toString();
}
From source file: com.chimpler.example.bayes.Classifier.java
License: Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is most likely
        // to be associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file: com.cloudera.knittingboar.records.Test20NewsgroupsBookParsing.java
License: Apache License
/**
 * Counts words.
 *
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    System.out.println("> ----- countWords ------");

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // the TokenStream contract requires reset() before incrementToken()

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        System.out.print(" " + s);
        words.add(s);
    }

    System.out.println("\n<");

    /* overallCounts.addAll(words); */
}
From source file: com.cloudera.knittingboar.records.TwentyNewsgroupsRecordFactory.java
License: Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // the TokenStream contract requires reset() before incrementToken()

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
}