List of usage examples for org.apache.lucene.analysis.TokenStream#end()
public void end() throws IOException

This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API). It can be used to perform end-of-stream operations, such as setting the final offset of a stream.
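All of the examples below follow the same consumer contract around end(). As a minimal sketch of that workflow (the class name, field name, and sample text are illustrative assumptions, not taken from any example below):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EndUsageSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(); // no-arg ctor in Lucene 5+; older versions take a Version
        try (TokenStream stream = analyzer.tokenStream("body", "some text to tokenize")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                 // required before the first call to incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end();                   // perform end-of-stream work, e.g. set the final offset
        }                                   // try-with-resources calls close()
        analyzer.close();
    }
}

Many tokenizers enforce this call sequence and will throw if reset() is skipped, so reset(), incrementToken() until false, end(), then close() is the order to follow.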
From source file:com.tuplejump.stargate.lucene.query.Condition.java
License:Apache License
protected String analyze(String field, String value, Analyzer analyzer) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, value);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken()) {
            return null;
        }
        termAtt.fillBytesRef();
        // a multi-term value must analyze to exactly one token
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        source.end();
        return BytesRef.deepCopyOf(bytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println(
                "Classifies input text document into a class given a model, dictionary, document frequency and input file");
        System.out.println(
                "Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];
    Configuration configuration = new Configuration();
    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);
    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));
    // analyzer used to extract words from the input file
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    int labelCount = labels.size();
    // the total document count is stored under the special key -1
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    String line = null;
    while ((line = reader.readLine()) != null) {
        stringBuilder.append(line);
        stringBuilder.append(lineSeparator);
    }
    reader.close();
    Multiset<String> words = ConcurrentHashMultiset.create();
    // extract words from the input file
    TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
            String word = termAtt.toString();
            Integer wordId = dictionary.get(word);
            // if the word is not in the dictionary, skip it
            if (wordId != null) {
                words.add(word);
                wordCount++;
            }
        }
    }
    // signal end of stream, then release the TokenStream
    ts.end();
    ts.close();
    // create vector wordId => weight using tf-idf
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }
    // The classifier returns one score per label; the label with the
    // highest score is the most likely class for the document.
    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }
    }
    System.out.println(" Class Label: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);
    analyzer.close();
}
From source file:com.weclay.ksearch2.BasicKoreanAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException {
    // text to tokenize (Korean sample text; the original characters were lost in extraction)
    //final String text = " ? ?";
    //String text = " ,?, ?";
    String text = " ??. . DB ? ? ?? , ? ? , , ?, ? ... ? ? ? ? ?.";
    BasicKoreanAnalyzer analyzer = new BasicKoreanAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    // get the term and offset attributes from the TokenStream
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    // print all tokens until the stream is exhausted
    while (stream.incrementToken()) {
        System.out.println(termAtt + ": " + termAtt.length() + " (" + offsetAtt.startOffset() + ":"
                + offsetAtt.endOffset() + ")");
    }
    stream.end();
    stream.close();
}
From source file:com.wonders.xlab.healthcloud.IKAnalyzerDemo.java
License:Apache License
public static void main(String[] args) {
    // construct an IK analyzer; the boolean selects smart segmentation (false = fine-grained mode)
    Analyzer analyzer = new IKAnalyzer(false);
    // obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        // sample text (the original Chinese characters were lost in extraction)
        ts = analyzer.tokenStream("myfield", new StringReader("??????"));
        // offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // term attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // reset the TokenStream to position the StringReader
        ts.reset();
        // iterate over all tokens
        while (ts.incrementToken()) {
            // skip single-character tokens
            if (term.toString().length() == 1) {
                continue;
            }
            // print offsets, term text, and token type
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // notify the TokenStream that the end of the StringReader was reached
        ts.end(); // perform end-of-stream operations, e.g. set the final offset
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // close the TokenStream and release resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:com.zimbra.cs.index.query.ContactQuery.java
License:Open Source License
public ContactQuery(String text) {
    TokenStream stream = new ContactTokenFilter(
            new AddrCharTokenizer(new HalfwidthKanaVoicedMappingFilter(new StringReader(text))));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(CharMatcher.is('*').trimTrailingFrom(termAttr)); // remove trailing wildcard characters
        }
        stream.end();
        stream.close();
    } catch (IOException e) { // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}
From source file:com.zimbra.cs.index.query.TextQuery.java
License:Open Source License
TextQuery(TokenStream stream, String field, String text) {
    this.field = field;
    this.text = text;
    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(termAttr.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) { // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}
From source file:com.zimbra.cs.index.ZimbraAnalyzer.java
License:Open Source License
public static String getAllTokensConcatenated(String fieldName, Reader reader) {
    StringBuilder toReturn = new StringBuilder();
    TokenStream stream = SINGLETON.tokenStream(fieldName, reader);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            toReturn.append(term);
            toReturn.append(' ');
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        e.printStackTrace(); // otherwise eat it
    }
    return toReturn.toString();
}
From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java
License:Open Source License
/**
 * We intentionally disable the positionIncrement because we want phrases to match across removed stop words.
 *
 * @see PositionIncrementAttribute
 */
@Test
public void positionIncrement() throws Exception {
    TokenStream stream = ZimbraAnalyzer.getInstance().tokenStream(LuceneFields.L_H_SUBJECT,
            new StringReader("It's a test."));
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset(); // required before the first call to incrementToken()
    while (stream.incrementToken()) {
        Assert.assertEquals(1, posIncrAtt.getPositionIncrement());
    }
    stream.end();
    stream.close();
}
From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java
License:Open Source License
public static List<String> toTokens(TokenStream stream) throws IOException {
    List<String> result = new ArrayList<String>();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        result.add(termAttr.toString());
    }
    stream.end();
    // note: the stream is ended but not closed here; the caller retains
    // ownership and is responsible for closing it
    return result;
}
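A hedged usage sketch of the ownership split above (the analyzer call and field constant are borrowed from the positionIncrement test; the sample text is an illustrative assumption): toTokens() consumes the stream and calls end(), while the caller closes it.

TokenStream stream = ZimbraAnalyzer.getInstance().tokenStream(LuceneFields.L_H_SUBJECT,
        new StringReader("It's a test."));
List<String> tokens = toTokens(stream); // consumes the stream and calls end()
stream.close();                         // caller releases the underlying resources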
From source file:de.blizzy.documentr.search.PageFinder.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Util.closeQuietly(tokenStream);
    }

    // process words from last to first so that earlier offsets stay valid
    // while replacements change the length of the text
    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);
            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }
    return null;
}