List of usage examples for org.apache.lucene.analysis.TokenStream#getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. The method returns the instance of the requested attribute already contained in the stream's AttributeSource and throws an IllegalArgumentException if that attribute is not present; use addAttribute(Class) when the attribute should be created on demand.
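Before the project-specific examples, here is a minimal, self-contained consumer sketch. It assumes Lucene 5+ (where StandardAnalyzer has a no-argument constructor); the class name GetAttributeExample, the field name "body", and the input string are illustrative and not taken from the examples below.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        // TokenStream is Closeable, so try-with-resources releases it even on failure.
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello token streams"))) {
            // getAttribute throws IllegalArgumentException if the stream lacks the attribute,
            // so either check hasAttribute first or fall back to addAttribute to create it.
            CharTermAttribute termAtt = ts.hasAttribute(CharTermAttribute.class)
                    ? ts.getAttribute(CharTermAttribute.class)
                    : ts.addAttribute(CharTermAttribute.class);
            ts.reset();                          // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                            // records the final offset state
        }
    }
}

Checking hasAttribute (or using addAttribute) is the defensive pattern the Lucene Javadoc recommends for consumers, since getAttribute assumes the attribute already exists on the stream.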
From source file:org.apache.mahout.classifier.sgd.NewsgroupHelper.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
    Multiset<String> overallCounts) throws IOException {
  TokenStream ts = analyzer.reusableTokenStream("text", in);
  ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    String s = ts.getAttribute(CharTermAttribute.class).toString();
    words.add(s);
  }
  overallCounts.addAll(words);
}
From source file:org.apache.mahout.classifier.sgd.TrainNewsGroups.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
  TokenStream ts = analyzer.tokenStream("text", in);
  ts.addAttribute(CharTermAttribute.class);
  // Note: the stream is not reset() here; Lucene 4+ requires reset() before the first incrementToken().
  while (ts.incrementToken()) {
    String s = ts.getAttribute(CharTermAttribute.class).toString();
    words.add(s);
  }
  // overallCounts is a static Multiset<String> field of the enclosing TrainNewsGroups class.
  overallCounts.addAll(words);
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
private static void validateTokens(String[] expected, TokenStream ts) throws IOException {
  int pos = 0;
  while (ts.incrementToken()) {
    assertTrue("Analyzer produced too many tokens", pos <= expected.length);
    CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
    assertEquals("Unexpected term", expected[pos++], termAttr.toString());
  }
  assertEquals("Analyzer produced too few terms", expected.length, pos);
}
From source file:org.apache.solr.analysis.TestTrimFilter.java
License:Apache License
public void testTrim() throws Exception {
  char[] a = " a ".toCharArray();
  char[] b = "b ".toCharArray();
  char[] ccc = "cCc".toCharArray();
  char[] whitespace = " ".toCharArray();
  char[] empty = "".toCharArray();
  TokenStream ts = new TrimFilter(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
      new Token(b, 0, b.length, 6, 10), new Token(ccc, 0, ccc.length, 11, 15),
      new Token(whitespace, 0, whitespace.length, 16, 20), new Token(empty, 0, empty.length, 21, 21)), false);

  TermAttribute token;
  assertTrue(ts.incrementToken());
  token = (TermAttribute) ts.getAttribute(TermAttribute.class);
  assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
  assertTrue(ts.incrementToken());
  assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
  assertTrue(ts.incrementToken());
  assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
  assertTrue(ts.incrementToken());
  assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
  assertTrue(ts.incrementToken());
  assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
  assertFalse(ts.incrementToken());

  a = " a".toCharArray();
  b = "b ".toCharArray();
  ccc = " c ".toCharArray();
  whitespace = " ".toCharArray();
  ts = new TrimFilter(new IterTokenStream(new Token(a, 0, a.length, 0, 2), new Token(b, 0, b.length, 0, 2),
      new Token(ccc, 0, ccc.length, 0, 3), new Token(whitespace, 0, whitespace.length, 0, 3)), true);

  List<Token> expect = tokens("a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3");
  List<Token> real = getTokens(ts);
  for (Token t : expect) {
    System.out.println("TEST:" + t);
  }
  for (Token t : real) {
    System.out.println("REAL:" + t);
  }
  assertTokEqualOff(expect, real);
}
From source file:org.apache.solr.analysis.TestWordDelimiterFilter.java
License:Apache License
private void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[],
    int posIncs[]) throws Exception {
  TokenStream ts = a.tokenStream("dummy", new StringReader(input));
  TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
  OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
  PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
      .getAttribute(PositionIncrementAttribute.class);
  for (int i = 0; i < output.length; i++) {
    assertTrue(ts.incrementToken());
    assertEquals(output[i], termAtt.term());
    assertEquals(startOffsets[i], offsetAtt.startOffset());
    assertEquals(endOffsets[i], offsetAtt.endOffset());
    assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
  }
  assertFalse(ts.incrementToken());
  ts.close();
}
From source file:org.apache.solr.handler.AnalysisRequestHandlerBase.java
License:Apache License
/**
 * Analyzes the given text using the given analyzer and returns the produced tokens.
 *
 * @param query    The query to analyze.
 * @param analyzer The analyzer to use.
 */
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
  TokenStream tokenStream = null;
  try {
    tokenStream = analyzer.tokenStream("", query);
    final Set<BytesRef> tokens = new HashSet<BytesRef>();
    final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
    final BytesRef bytes = bytesAtt.getBytesRef();

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      bytesAtt.fillBytesRef();
      tokens.add(BytesRef.deepCopyOf(bytes));
    }
    tokenStream.end();
    return tokens;
  } catch (IOException ioe) {
    throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
}
From source file:org.apache.solr.handler.ClassifyStream.java
License:Apache License
@Override
public Tuple read() throws IOException {
  if (modelTuple == null) {
    modelTuple = modelStream.read();
    if (modelTuple == null || modelTuple.EOF) {
      throw new IOException("Model tuple not found for classify stream!");
    }
    termToIndex = new HashMap<>();
    List<String> terms = modelTuple.getStrings("terms_ss");
    for (int i = 0; i < terms.size(); i++) {
      termToIndex.put(terms.get(i), i);
    }
    idfs = modelTuple.getDoubles("idfs_ds");
    modelWeights = modelTuple.getDoubles("weights_ds");
  }

  Tuple docTuple = docStream.read();
  if (docTuple.EOF)
    return docTuple;

  String text = docTuple.getString(field);
  double tfs[] = new double[termToIndex.size()];

  TokenStream tokenStream = analyzer.tokenStream(analyzerField, text);
  CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
  tokenStream.reset();

  int termCount = 0;
  while (tokenStream.incrementToken()) {
    termCount++;
    if (termToIndex.containsKey(termAtt.toString())) {
      tfs[termToIndex.get(termAtt.toString())]++;
    }
  }
  tokenStream.end();
  tokenStream.close();

  List<Double> tfidfs = new ArrayList<>(termToIndex.size());
  tfidfs.add(1.0);
  for (int i = 0; i < tfs.length; i++) {
    if (tfs[i] != 0) {
      tfs[i] = 1 + Math.log(tfs[i]);
    }
    tfidfs.add(this.idfs.get(i) * tfs[i]);
  }

  double total = 0.0;
  for (int i = 0; i < tfidfs.size(); i++) {
    total += tfidfs.get(i) * modelWeights.get(i);
  }

  double score = total * ((float) (1.0 / Math.sqrt(termCount)));
  double positiveProb = sigmoid(total);

  docTuple.put("probability_d", positiveProb);
  docTuple.put("score_d", score);

  return docTuple;
}
From source file:org.apache.solr.handler.component.WordCloudComponent.java
License:Apache License
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrQueryRequest req = rb.req;
  SolrParams params = req.getParams();
  if (!params.getBool(COMPONENT_NAME, true)) {
    return;
  }

  String wcFields = null;
  if ((wcFields = params.get("wordcloud.fl", null)) == null) {
    return;
  }

  Set<String> flds = new HashSet<String>(StrUtils.splitSmart(wcFields, ','));
  DocList ids = rb.getResults().docList;

  SolrIndexSearcher searcher = rb.req.getSearcher();
  IndexSchema schema = rb.req.getCore().getLatestSchema();

  final Analyzer analyzer = rb.req.getCore().getLatestSchema().getAnalyzer();
  final HashMap<String, FieldType> fieldsToLoad = new HashMap<String, FieldType>();

  CharTermAttribute termAtt;
  Map<String, Map<String, Integer>> tokens = new HashMap<String, Map<String, Integer>>();

  for (String f : flds) {
    SchemaField field = schema.getFieldOrNull(f);
    if (field == null || !field.stored()) {
      continue; // ignore this field
    }
    fieldsToLoad.put(f, field.getType());
    tokens.put(f, new HashMap<String, Integer>());
  }

  DocIterator iterator = ids.iterator();
  String w;
  Integer v;
  int sz = ids.size();
  for (int i = 0; i < sz; i++) {
    int id = iterator.nextDoc();
    Document doc = searcher.doc(id, fieldsToLoad.keySet());
    for (Entry<String, FieldType> en : fieldsToLoad.entrySet()) {
      Map<String, Integer> toks = tokens.get(en.getKey());
      String[] vals = doc.getValues(en.getKey());
      FieldType fType = en.getValue();

      if (vals != null) {
        for (String s : vals) {
          TokenStream buffer = analyzer.tokenStream(en.getKey(),
              new StringReader(fType.indexedToReadable(s)));

          if (!buffer.hasAttribute(CharTermAttribute.class)) {
            continue; // empty stream
          }

          termAtt = buffer.getAttribute(CharTermAttribute.class);
          buffer.reset();

          while (buffer.incrementToken()) {
            w = termAtt.toString();
            v = toks.get(w);
            if (v == null)
              v = 0;
            toks.put(w, ++v);
          }

          buffer.close();
        }
      }
    }
  }

  // TODO: filter out the tokens (use some sort of a range 0.1-0.9 by frequency)
  AtomicReader reader = searcher.getAtomicReader();
  BytesRef term;
  int df;
  String f;
  Map<String, Map<String, Double>> docFreqs = new HashMap<String, Map<String, Double>>();
  for (Entry<String, Map<String, Integer>> field : tokens.entrySet()) {
    HashMap<String, Double> idfs = new HashMap<String, Double>();
    f = field.getKey();
    docFreqs.put(f, idfs);
    int N = reader.getDocCount(f);

    for (Entry<String, Integer> token : field.getValue().entrySet()) {
      w = token.getKey();
      df = reader.docFreq(new Term(f, new BytesRef(w)));
      if (df != 0) {
        idfs.put(w, Math.log10(N / df));
      }
    }
  }

  HashMap<String, Object> ret = new HashMap<String, Object>();
  for (String fi : fieldsToLoad.keySet()) {
    HashMap<String, Object> va = new HashMap<String, Object>();
    va.put("tf", tokens.get(fi));
    va.put("idf", docFreqs.get(fi));
    ret.put(fi, va);
  }

  rb.rsp.add("wordcloud", ret);
}
From source file:org.apache.solr.highlight.GapFragmenter.java
License:Apache License
@Override
public void start(String originalText, TokenStream tokenStream) {
  offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
  posIncAtt = tokenStream.getAttribute(PositionIncrementAttribute.class);
  fragOffset = 0;
}
From source file:org.apache.solr.highlight.LuceneRegexFragmenter.java
License:Apache License
@Override
public void start(String originalText, TokenStream tokenStream) {
  currentNumFrags = 1;
  currentOffset = 0;
  addHotSpots(originalText);
  posIncAtt = tokenStream.getAttribute(PositionIncrementAttribute.class);
  offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
}