List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException
This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API).
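All of the examples below follow the same consume workflow: reset() the stream, loop on incrementToken() until it returns false, call end() to record end-of-stream state (such as the final offset), and finally close() the stream. For orientation, here is a minimal, self-contained sketch of that pattern; the class name TokenStreamEndSketch, the helper name tokenize, the field name "body", and the no-argument StandardAnalyzer constructor (Lucene 5+; the 3.x/4.x examples below pass a Version) are illustrative assumptions, not taken from any of the examples.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamEndSketch {

    // Tokenize a string and return its terms, calling end() once incrementToken() returns false.
    static List<String> tokenize(Analyzer analyzer, String text) throws IOException {
        List<String> terms = new ArrayList<>();
        // "body" is a placeholder field name; any field name works for ad-hoc analysis
        TokenStream stream = analyzer.tokenStream("body", new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        try {
            stream.reset();                       // must be called before the first incrementToken()
            while (stream.incrementToken()) {     // returns false when the stream is exhausted
                terms.add(termAtt.toString());
            }
            stream.end();                         // perform end-of-stream operations (e.g. final offset)
        } finally {
            stream.close();                       // release resources even if consumption failed
        }
        return terms;
    }

    public static void main(String[] args) throws IOException {
        // No-arg StandardAnalyzer constructor assumes Lucene 5 or later
        Analyzer analyzer = new StandardAnalyzer();
        System.out.println(tokenize(analyzer, "TokenStream end usage example"));
        analyzer.close();
    }
}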
From source file:de.blizzy.documentr.search.PageIndex.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Closeables.closeQuietly(tokenStream);
    }

    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader, SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);
            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }

    return null;
}
From source file:de.mirkosertic.desktopsearch.SearchPhraseSuggester.java
License:Open Source License
private String analyze(String aFieldName, String aString) throws IOException {
    TokenStream theTokenStream = analyzer.tokenStream(aFieldName, aString);
    theTokenStream.reset();
    CharTermAttribute theCharTerms = theTokenStream.getAttribute(CharTermAttribute.class);
    try {
        if (theTokenStream.incrementToken()) {
            return theCharTerms.toString();
        }
        return null;
    } finally {
        theTokenStream.end();
        theTokenStream.close();
    }
}
From source file:de.twitterlivesearch.analysis.Tokenizer.java
License:Apache License
/**
 * @param stringToAnalyze
 *            String to be tokenized
 * @param analyzer
 *            {@link org.apache.lucene.analysis.Analyzer Analyzer} to be used for analysis
 *
 * @return list of tokens
 */
public static List<String> getTokensForString(String stringToAnalyze, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(stringToAnalyze));
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return tokens;
}
From source file:di.uniba.it.tri.occ.BuildOccurrence.java
License:Open Source License
private List<String> getTokens(Reader reader) throws IOException {
    List<String> tokens = new ArrayList<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    TokenStream tokenStream = analyzer.tokenStream("text", reader);
    tokenStream.reset();
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        String token = cattr.toString();
        String[] split = token.split("'");
        if (split.length == 1) {
            tokens.add(token);
        } else {
            int max = 0;
            int index = 0;
            for (int i = 0; i < split.length; i++) {
                if (split[i].length() > max) {
                    max = split[i].length();
                    index = i;
                }
            }
            tokens.add(split[index]);
        }
    }
    tokenStream.end();
    return tokens;
}
From source file:doc2vec.LuceneDocIterator.java
String preProcess(Analyzer analyzer, String text) throws Exception {
    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        if (labelsStoredWithWords) {
            term = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM)[0]; // the first part is the word
        }
        if (!term.trim().equals(""))
            tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return tokenizedContentBuff.toString();
}
From source file:drakkar.mast.retrieval.analysis.NGramQuery.java
/**
 *
 * @param analyzer
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer analyzer, String queryTerm, String field) throws IOException {
    String words[] = null;
    // split on white space
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[1];
        words[0] = queryTerm;
    }
    // more than one term
    if (words.length > 1) {
        for (int i = 0; i < words.length; i++) {
            String string = words[i];
            Term t = new Term(field, string);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }
    } else {
        // a single term
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = analyzer.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            TermQuery pquery;
            for (; tokens.incrementToken();
                    add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST))) {
                Term t = new Term(field, termAtt.term());
                pquery = new TermQuery(t);
            }
            tokens.end();
            tokens.close();
        }
    }
}
From source file:drakkar.mast.retrieval.ngram.NGramQuery.java
/**
 *
 * @param a
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer a, String queryTerm, String field) throws IOException {
    String words[] = null;
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[1];
        words[0] = queryTerm;
    }
    if (words.length > 1) {
        for (int i = 0; i < words.length; i++) {
            String string = words[i];
            Term t = new Term(field, string);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }
    } else {
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = a.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            TermQuery pquery;
            for (; tokens.incrementToken();
                    add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST))) {
                Term t = new Term(field, termAtt.term());
                pquery = new TermQuery(t);
            }
            tokens.end();
            tokens.close();
        }
    }
}
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.TFIDF.java
License:Open Source License
protected void computeTFIDF(List<TFIDFTerm> wordList, int totalWordsDoc) {
    if (reader != null && searcher != null) {
        double tf;
        double idf;
        double tfidf;
        EnglishAnalyzer analyzer = new EnglishAnalyzer(Version.LUCENE_40);
        TokenStream stream = null;
        CharTermAttribute termAtt;
        String term;
        double totalWikiDocs = (double) reader.numDocs();
        for (TFIDFTerm word : wordList) {
            try {
                term = "";
                stream = analyzer.tokenStream("field", new StringReader(word.word));
                termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                // concatenate all tokens until the stream is exhausted
                while (stream.incrementToken()) {
                    term += (termAtt.toString());
                }
                // System.out.println(term);
                stream.end();
                tf = (double) word.count / (double) totalWordsDoc;
                double wikiTermFrec = reader.docFreq(new Term("contents", term));
                if (wikiTermFrec != 0) {
                    idf = Math.log(totalWikiDocs / wikiTermFrec);
                    tfidf = tf * idf;
                } else {
                    tfidf = 0;
                }
                word.tfidf = tfidf;
            } catch (IOException ex) {
                logger.error("Error processing the TFIDF", ex);
            } finally {
                try {
                    if (stream != null) {
                        stream.close();
                    }
                } catch (IOException ex) {
                    logger.error("Error processing the TFIDF", ex);
                }
            }
        }
        try {
            reader.close();
        } catch (IOException ex) {
            logger.warn("Error closing lucene reader", ex);
        }
    }
}
From source file:edu.mit.ll.vizlinc.highlight.Highlighter.java
License:Apache License
/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param maxNumFragments
 * @param mergeContiguousFragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments) throws IOException, InvalidTokenOffsetsException {

    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);
    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text, tokenStream);

        TokenGroup tokenGroup = new TokenGroup(tokenStream);

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
                throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                        + " exceeds length of provided text sized " + text.length());
            }
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct())) {
                //the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                //store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);
                tokenGroup.clear();

                //check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment()) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    //record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(fragmentScorer.getTokenScore());

            // if(lastEndOffset>maxDocBytesToAnalyze)
            // {
            //     break;
            // }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            //flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            //store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        //Test what remains of the original text beyond the point where we stopped analyzing
        if (
        //if there is text beyond the last token considered..
        (lastEndOffset < text.length()) &&
        //and that text is not too large...
        (text.length() <= maxDocCharsToAnalyze)) {
            //append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        //sort the most relevant sections of the text
        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();) {
            currentFrag = i.next();

            //If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
            if (currentFrag.getScore() >= minScore) {
                fragQueue.put(currentFrag);
                if (fragQueue.size() > maxNumFragments) { // if hit queue overfull
                    fragQueue.pop(); // remove lowest in hit queue
                    minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                }
            }
            */
            //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            //fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insertWithOverflow(currentFrag);
        }

        //return the most relevant fragments
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--) {
            frag[i] = fragQueue.pop();
        }

        //merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(frag);
            ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
            for (int i = 0; i < frag.length; i++) {
                if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                    fragTexts.add(frag[i]);
                }
            }
            frag = fragTexts.toArray(new TextFragment[0]);
        }

        return frag;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
From source file:edu.stanford.rad.naivebayes.ClassifyLines.java
License:Apache License
public static void main(String[] args) throws Exception {
    // if (args.length < 5) {
    //     System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
    //     return;
    // }
    // String modelPath = args[0];
    // String labelIndexPath = args[1];
    // String dictionaryPath = args[2];
    // String documentFrequencyPath = args[3];
    // String tweetsPath = args[4];
    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error : close ts:TokenStream
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}