List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. getAttribute returns the instance of that attribute contained in the stream's AttributeSource and throws an IllegalArgumentException if the stream does not contain it; use addAttribute(Class) instead when the attribute may not have been added yet.
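Before the repository examples below, here is a minimal, self-contained sketch of the typical consumer loop: register (or fetch) the attribute, reset the stream, iterate with incrementToken(), then end and close. The field name "body", the sample text, and the StandardAnalyzer choice are illustrative placeholders, not taken from any of the projects listed here.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("body", "Hello getAttribute example");
            // addAttribute registers the attribute if it is absent; getAttribute would throw
            // IllegalArgumentException if the stream did not already contain it.
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // getAttribute returns the same attribute instance on every call
                System.out.println(ts.getAttribute(CharTermAttribute.class).toString());
            }
            ts.end();
            ts.close();
        }
    }
}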
From source file:com.tuplejump.stargate.lucene.query.Condition.java
License:Apache License
protected String analyze(String field, String value, Analyzer analyzer) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, value);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken()) {
            return null;
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        source.end();
        return BytesRef.deepCopyOf(bytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println("Classifies input text document into a class given a model, dictionary, document frequency and input file");
        System.out.println("Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract words from the input file
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    String line = null;
    while ((line = reader.readLine()) != null) {
        stringBuilder.append(line);
        stringBuilder.append(lineSeparator);
    }
    // close the reader
    reader.close();

    Multiset<String> words = ConcurrentHashMultiset.create();

    // extract words from the input file
    TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
            String word = ts.getAttribute(CharTermAttribute.class).toString();
            Integer wordId = dictionary.get(word);
            // if the word is not in the dictionary, skip it
            if (wordId != null) {
                words.add(word);
                wordCount++;
            }
        }
    }
    // close the TokenStream
    ts.end();
    ts.close();

    // create vector wordId => weight using tf-idf
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }

    // The classifier returns one score per label; the label with the highest
    // score is the one the document most likely belongs to.
    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }
    }
    System.out.println(" Class Label: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);

    analyzer.close();
}
From source file:com.wiseowl.WiseOwl.query.WiseOwlQParser.java
License:Apache License
@Override
public Query parse() throws SyntaxError {
    // Parse the question using the TreebankParser; the resulting Parse object
    // is used by the classifier to determine the answer type.
    Parse parse = ParserTool.parseLine(qstr, parser, 1)[0];
    String type = atc.computeAnswerType(parse);
    String mt = atm.get(type);
    if (mt.equals("DESCRIPTION")) {
        BooleanQuery bq;
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        String field = "text";
        SchemaField sf = req.getSchema().getFieldOrNull(field);
        try {
            Analyzer analyzer = sf.getType().getQueryAnalyzer();
            TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
            ts.reset();
            CharTermAttribute tok = null;
            while (ts.incrementToken()) {
                // add the original query terms to the query
                tok = ts.getAttribute(CharTermAttribute.class);
                String term = tok.toString();
                builder.add(new TermQuery(new Term(field, term)), BooleanClause.Occur.SHOULD);
            }
            ts.close();
        } catch (IOException e) {
            throw new SyntaxError(e.getLocalizedMessage());
        }
        bq = builder.build();
        return bq;
    } else {
        String field = "text";
        SchemaField sp = req.getSchema().getFieldOrNull(field);
        if (sp == null) {
            throw new SolrException(ErrorCode.SERVER_ERROR, "Undefined field: " + field);
        }
        List<SpanQuery> sql = new ArrayList<SpanQuery>();
        if (mt != null) {
            // add the answer type to the query
            String[] parts = mt.split("\\|");
            if (parts.length == 1) {
                sql.add(new SpanTermQuery(new Term(field, mt.toLowerCase())));
            } else {
                for (int pi = 0; pi < parts.length; pi++) {
                    sql.add(new SpanTermQuery(new Term(field, parts[pi].toLowerCase())));
                }
            }
        }
        log.warn("answer type mt : {} {} ", mt, type);
        FocusNoun fn = new FocusNoun();
        String fnn[] = null;
        try {
            fnn = fn.getFocusNoun(qstr);
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        try {
            Analyzer analyzer = sp.getType().getQueryAnalyzer();
            TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
            ts.reset();
            CharTermAttribute tok = null;
            while (ts.incrementToken()) {
                // add the original query terms, boosting the focus nouns more strongly
                tok = ts.getAttribute(CharTermAttribute.class);
                String term = tok.toString();
                log.warn("terms boosted {} ", term);
                if (fnn != null) {
                    if (term.equals(fnn[0]) || term.equals(fnn[1])) {
                        SpanQuery sq = new SpanTermQuery(new Term(field, term));
                        sql.add(new SpanBoostQuery(sq, 100f));
                    } else {
                        SpanQuery sq = new SpanTermQuery(new Term(field, term));
                        sql.add(new SpanBoostQuery(sq, 5f));
                    }
                }
            }
            ts.close();
        } catch (IOException e) {
            throw new SyntaxError(e.getLocalizedMessage());
        }
        // query the index for all of the parts near each other
        return new SpanOrQuery(sql.toArray(new SpanQuery[sql.size()]));
    }
}
From source file:com.xiaomi.linden.lucene.analyzer.CommonMMSeg4jSegmenter.java
License:Apache License
@Override
public List<Term> parse(String content) throws Exception {
    List<Term> words = new ArrayList<>();
    if (content == null || content.isEmpty()) {
        return words;
    }
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream("", content);
        stream.reset();
        if (stopWords != null) {
            if (cutLetterDigit) {
                stream = new CutLetterDigitFilter(new StopFilter(stream, stopWords));
            } else {
                stream = new StopFilter(stream, stopWords);
            }
        } else {
            if (cutLetterDigit) {
                stream = new CutLetterDigitFilter(stream);
            }
        }
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
        while (stream.incrementToken()) {
            words.add(new Term(termAttr.toString(), offsetAttribute.startOffset(), offsetAttribute.endOffset()));
        }
    } catch (IOException e) {
        throw new Exception(content + " extract words from phrase failed!", e);
    } finally {
        if (stream != null) {
            stream.close();
        }
    }
    return words;
}
From source file:com.xiaomi.linden.lucene.analyzer.TestLindenWordDelimiterAnalyzer.java
License:Apache License
@Test
public void testLindenWordDelimiterAnalyzer() throws Exception {
    LindenWordDelimiterAnalyzerFactory wordDelimiterAnalyzerFactory = new LindenWordDelimiterAnalyzerFactory();
    Map<String, String> args = new HashMap<>();
    Map<String, String> lastargs = new HashMap<>();
    args.put("luceneMatchVersion", "LUCENE_4_10_0");
    lastargs.putAll(args);
    Analyzer analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    TokenStream stream = analyzer.tokenStream("",
        new StringReader("Hello, this is a test case. " + "" + "created2018by sls sun-li-shun SunLiShun"));
    String expected = "[hello][test][case][][][][][][][][][][][][created][2018][sls][sun][li][shun][sun][li][shun]";
    String out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);

    args.put("lower.case", "false");
    args.putAll(lastargs);
    lastargs.putAll(args);
    analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    stream = analyzer.tokenStream("",
        new StringReader("Hello, this is a test case. " + "" + "created2018by sls on 20140707"));
    expected = "[Hello][test][case][][][][][][][][][][][][created][2018][sls][20140707]";
    out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);

    args.put("set.stopwords", "false");
    args.putAll(lastargs);
    lastargs.putAll(args);
    analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    stream = analyzer.tokenStream("",
        new StringReader("Hello, this is a test case. " + "" + "created2018by sls on 20140707"));
    expected = "[Hello][this][is][a][test][case][][][][][][][][][][][][created][2018][by][sls][on][20140707]";
    out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);

    args.putAll(lastargs);
    args.put("splitOnCaseChange", "0");
    args.put("set.stopwords", "false");
    args.put("lower.case", "true");
    lastargs.putAll(args);
    analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    stream = analyzer.tokenStream("",
        new StringReader("Hello, this is a test case. " + "" + "created2018by sls sun-li-shun SunLiShun"));
    expected = "[hello][this][is][a][test][case][][][][][][][][][][][][created][2018][by][sls][sun][li][shun][sunlishun]";
    out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);
}
From source file:com.xiaomi.linden.lucene.query.flexiblequery.FlexibleQuery.java
License:Apache License
private List<SegToken> parseToTokens(String content, float boost) throws IOException {
    List<SegToken> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("", new StringReader(content));
    try {
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(new SegToken(term.toString(), boost));
        }
    } finally {
        if (stream != null) {
            stream.close();
        }
    }
    return tokens;
}
From source file:com.zb.mmseg.analysis.TokenUtils.java
License:Open Source License
/**
 * @param input the TokenStream to read the next token from
 * @param reusableToken if null, a new Token is created automatically
 * @return null if there is no next token or if input is null
 * @throws IOException
 */
public static Token nextToken(TokenStream input, Token reusableToken) throws IOException {
    if (input == null) {
        return null;
    }
    if (!input.incrementToken()) {
        return null;
    }
    CharTermAttribute termAtt = input.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = input.getAttribute(TypeAttribute.class);
    if (reusableToken == null) {
        reusableToken = new Token();
    }
    reusableToken.clear();
    if (termAtt != null) {
        // Lucene 3.1+: copyBuffer replaces the Lucene 3.0 setTermBuffer call
        reusableToken.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    }
    if (offsetAtt != null) {
        // Lucene 4.0: setOffset replaces the separate setStartOffset/setEndOffset calls
        reusableToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
    }
    if (typeAtt != null) {
        reusableToken.setType(typeAtt.type());
    }
    return reusableToken;
}
From source file:CopulaResources.TermCooccurence.java
private static List tokenizeString(Analyzer analyzer, String str) {
    List result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
From source file:ddf.catalog.pubsub.criteria.contextual.ContextualEvaluator.java
License:Open Source License
private static void logTokens(Analyzer analyzer, String fieldName, String fullDocument, String analyzerName)
        throws IOException {
    if (!LOGGER.isDebugEnabled()) {
        return;
    }
    TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(fullDocument));
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
    LOGGER.debug("----- {} tokens -----", analyzerName);
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = termAttribute.term();
        LOGGER.debug(term);
    }
    LOGGER.debug("----- END: {} tokens -----", analyzerName);
}
From source file:de.blizzy.documentr.search.PageFinder.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Util.closeQuietly(tokenStream);
    }

    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader, SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);
            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }

    return null;
}