List of usage examples for org.apache.lucene.analysis.TokenStream#reset()
public void reset() throws IOException
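All of the examples below follow the same consumption contract: reset() is called before the first incrementToken(), end() after the last accepted token, and the stream is closed afterwards. As a minimal sketch of that contract (the field name "body" is an illustrative placeholder and the analyzer is passed in, so this is not tied to any of the source files below; it also uses try-with-resources instead of the explicit IOUtils handling seen in several examples):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Canonical consumption loop: reset() before the first incrementToken(),
// end() after the last one, close() (via try-with-resources) when done.
static void printTokens(Analyzer analyzer, String text) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("body", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    ts.end();
  }
}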
From source file:org.apache.solr.analysis.TestBufferedTokenStream.java
License:Apache License
public void testReset() throws Exception {
  final String input = "How now A B brown A cow B like A B thing?";
  Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
  TokenStream ts = new AB_AAB_Stream(tokenizer);
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  assertTrue(ts.incrementToken());
  assertEquals("How", term.toString());
  assertTrue(ts.incrementToken());
  assertEquals("now", term.toString());
  assertTrue(ts.incrementToken());
  assertEquals("A", term.toString());
  // reset back to input,
  // if reset() does not work correctly then previous buffered tokens will remain
  tokenizer.reset(new StringReader(input));
  ts.reset();
  assertTrue(ts.incrementToken());
  assertEquals("How", term.toString());
}
From source file:org.apache.solr.handler.AnalysisRequestHandlerBase.java
License:Apache License
/**
 * Analyzes the given text using the given analyzer and returns the produced tokens.
 *
 * @param query    The query to analyze.
 * @param analyzer The analyzer to use.
 */
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
  TokenStream tokenStream = null;
  try {
    tokenStream = analyzer.tokenStream("", query);
    final Set<BytesRef> tokens = new HashSet<BytesRef>();
    final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
    final BytesRef bytes = bytesAtt.getBytesRef();

    tokenStream.reset();

    while (tokenStream.incrementToken()) {
      bytesAtt.fillBytesRef();
      tokens.add(BytesRef.deepCopyOf(bytes));
    }

    tokenStream.end();
    return tokens;
  } catch (IOException ioe) {
    throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
}
From source file:org.apache.solr.handler.AnalysisRequestHandlerBase.java
License:Apache License
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<AttributeSource>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end();
  } catch (IOException ioe) {
    throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
From source file:org.apache.solr.handler.ClassifyStream.java
License:Apache License
@Override
public Tuple read() throws IOException {
  if (modelTuple == null) {
    modelTuple = modelStream.read();
    if (modelTuple == null || modelTuple.EOF) {
      throw new IOException("Model tuple not found for classify stream!");
    }

    termToIndex = new HashMap<>();

    List<String> terms = modelTuple.getStrings("terms_ss");

    for (int i = 0; i < terms.size(); i++) {
      termToIndex.put(terms.get(i), i);
    }

    idfs = modelTuple.getDoubles("idfs_ds");
    modelWeights = modelTuple.getDoubles("weights_ds");
  }

  Tuple docTuple = docStream.read();
  if (docTuple.EOF) return docTuple;

  String text = docTuple.getString(field);

  double tfs[] = new double[termToIndex.size()];

  TokenStream tokenStream = analyzer.tokenStream(analyzerField, text);
  CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
  tokenStream.reset();

  int termCount = 0;
  while (tokenStream.incrementToken()) {
    termCount++;
    if (termToIndex.containsKey(termAtt.toString())) {
      tfs[termToIndex.get(termAtt.toString())]++;
    }
  }

  tokenStream.end();
  tokenStream.close();

  List<Double> tfidfs = new ArrayList<>(termToIndex.size());
  tfidfs.add(1.0);
  for (int i = 0; i < tfs.length; i++) {
    if (tfs[i] != 0) {
      tfs[i] = 1 + Math.log(tfs[i]);
    }
    tfidfs.add(this.idfs.get(i) * tfs[i]);
  }

  double total = 0.0;
  for (int i = 0; i < tfidfs.size(); i++) {
    total += tfidfs.get(i) * modelWeights.get(i);
  }

  double score = total * ((float) (1.0 / Math.sqrt(termCount)));
  double positiveProb = sigmoid(total);

  docTuple.put("probability_d", positiveProb);
  docTuple.put("score_d", score);

  return docTuple;
}
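The read() method above folds token counting into a larger scoring pipeline. Reduced to just the TokenStream part, the counting pattern looks roughly like the following sketch (the field name, analyzer parameter, and helper name are illustrative assumptions, not code from ClassifyStream):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Count raw term frequencies in a piece of text using the
// reset() / incrementToken() / end() / close() contract.
static Map<String, Integer> termFrequencies(Analyzer analyzer, String fieldName, String text) throws IOException {
  Map<String, Integer> freqs = new HashMap<>();
  TokenStream ts = analyzer.tokenStream(fieldName, text);
  try {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                               // must precede the first incrementToken()
    while (ts.incrementToken()) {
      freqs.merge(termAtt.toString(), 1, Integer::sum);
    }
    ts.end();
  } finally {
    ts.close();                               // release the analyzer's reusable components
  }
  return freqs;
}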
From source file:org.apache.solr.handler.component.QueryElevationComponent.java
License:Apache License
String getAnalyzedQuery(String query) throws IOException {
  if (analyzer == null) {
    return query;
  }
  StringBuilder norm = new StringBuilder();
  TokenStream tokens = analyzer.tokenStream("", query);
  try {
    tokens.reset();

    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
      norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    return norm.toString();
  } finally {
    IOUtils.closeWhileHandlingException(tokens);
  }
}
From source file:org.apache.solr.handler.component.SpellCheckComponent.java
License:Apache License
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
  Collection<Token> result = new ArrayList<Token>();
  assert analyzer != null;
  TokenStream ts = analyzer.tokenStream("", q);
  try {
    ts.reset();
    // TODO: support custom attributes
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      Token token = new Token();
      token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
      token.setType(typeAtt.type());
      token.setFlags(flagsAtt.getFlags());
      token.setPayload(payloadAtt.getPayload());
      token.setPositionIncrement(posIncAtt.getPositionIncrement());
      result.add(token);
    }
    ts.end();
    return result;
  } finally {
    IOUtils.closeWhileHandlingException(ts);
  }
}
From source file:org.apache.solr.handler.component.WordCloudComponent.java
License:Apache License
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrQueryRequest req = rb.req;
  SolrParams params = req.getParams();
  if (!params.getBool(COMPONENT_NAME, true)) {
    return;
  }

  String wcFields = null;
  if ((wcFields = params.get("wordcloud.fl", null)) == null) {
    return;
  }

  Set<String> flds = new HashSet<String>(StrUtils.splitSmart(wcFields, ','));
  DocList ids = rb.getResults().docList;

  SolrIndexSearcher searcher = rb.req.getSearcher();
  IndexSchema schema = rb.req.getCore().getLatestSchema();

  final Analyzer analyzer = rb.req.getCore().getLatestSchema().getAnalyzer();
  final HashMap<String, FieldType> fieldsToLoad = new HashMap<String, FieldType>();
  CharTermAttribute termAtt;
  Map<String, Map<String, Integer>> tokens = new HashMap<String, Map<String, Integer>>();

  for (String f : flds) {
    SchemaField field = schema.getFieldOrNull(f);
    if (field == null || !field.stored()) {
      continue; // ignore this field
    }
    fieldsToLoad.put(f, field.getType());
    tokens.put(f, new HashMap<String, Integer>());
  }

  DocIterator iterator = ids.iterator();
  String w;
  Integer v;
  int sz = ids.size();
  for (int i = 0; i < sz; i++) {
    int id = iterator.nextDoc();

    Document doc = searcher.doc(id, fieldsToLoad.keySet());
    for (Entry<String, FieldType> en : fieldsToLoad.entrySet()) {
      Map<String, Integer> toks = tokens.get(en.getKey());
      String[] vals = doc.getValues(en.getKey());
      FieldType fType = en.getValue();

      if (vals != null) {
        for (String s : vals) {
          TokenStream buffer = analyzer.tokenStream(en.getKey(),
              new StringReader(fType.indexedToReadable(s)));

          if (!buffer.hasAttribute(CharTermAttribute.class)) {
            continue; // empty stream
          }

          termAtt = buffer.getAttribute(CharTermAttribute.class);
          buffer.reset();

          while (buffer.incrementToken()) {
            w = termAtt.toString();
            v = toks.get(w);
            if (v == null)
              v = 0;
            toks.put(w, ++v);
          }

          buffer.close();
        }
      }
    }
  }

  // TODO: filter out the tokens (use some sort of a range 0.1-0.9 by frequency)

  AtomicReader reader = searcher.getAtomicReader();
  BytesRef term;
  int df;
  String f;
  Map<String, Map<String, Double>> docFreqs = new HashMap<String, Map<String, Double>>();
  for (Entry<String, Map<String, Integer>> field : tokens.entrySet()) {
    HashMap<String, Double> idfs = new HashMap<String, Double>();
    f = field.getKey();
    docFreqs.put(f, idfs);
    int N = reader.getDocCount(f);

    for (Entry<String, Integer> token : field.getValue().entrySet()) {
      w = token.getKey();
      df = reader.docFreq(new Term(f, new BytesRef(w)));
      if (df != 0) {
        idfs.put(w, Math.log10(N / df));
      }
    }
  }

  HashMap<String, Object> ret = new HashMap<String, Object>();
  for (String fi : fieldsToLoad.keySet()) {
    HashMap<String, Object> va = new HashMap<String, Object>();
    va.put("tf", tokens.get(fi));
    va.put("idf", docFreqs.get(fi));
    ret.put(fi, va);
  }

  rb.rsp.add("wordcloud", ret);
}
From source file:org.apache.solr.handler.DumpIndexField.java
License:Apache License
private void runSynchronously(RequestQueue queue, SolrQueryRequest req)
    throws MalformedURLException, IOException, InterruptedException {

  SolrCore core = req.getCore();
  IndexSchema schema = req.getSchema();

  RequestData data = queue.pop();

  if (!allowed.matcher(data.sourceField).matches()) {
    data.msg("Export of this field is not allowed: " + data.sourceField);
    queue.registerFailedBatch(data);
    return;
  }
  SchemaField field = core.getSchema().getFieldOrNull(data.sourceField);

  if (field == null || !field.stored()) {
    data.msg("We cannot dump fields that are not stored: " + data.sourceField);
    queue.registerFailedBatch(data);
    return;
  }

  final Analyzer analyzer = core.getSchema().getQueryAnalyzer();
  SchemaField targetField = core.getSchema().getFieldOrNull(data.targetField);

  if (targetField == null) {
    data.msg("We cannot find analyzer for: " + data.targetField);
    queue.registerFailedBatch(data);
    return;
  }

  final String targetAnalyzer = data.targetField;

  DirectoryReader ir = req.getSearcher().getIndexReader();
  SolrIndexSearcher se = req.getSearcher();
  final HashSet<String> fieldsToLoad = new HashSet<String>();
  fieldsToLoad.add(data.sourceField);

  se.search(new MatchAllDocsQuery(), new Collector() {
    private AtomicReader reader;
    private int i = 0;

    @Override
    public boolean acceptsDocsOutOfOrder() {
      return true;
    }

    @Override
    public void collect(int i) {
      Document d;
      try {
        d = reader.document(i, fieldsToLoad);
        for (String f : fieldsToLoad) {
          String[] vals = d.getValues(f);
          for (String s : vals) {
            TokenStream ts = analyzer.tokenStream(targetAnalyzer, new StringReader(s));
            ts.reset();
            while (ts.incrementToken()) {
              // pass
            }
          }
        }
      } catch (IOException e) {
        // pass
      }
    }

    @Override
    public void setNextReader(AtomicReaderContext context) {
      this.reader = context.reader();
    }

    @Override
    public void setScorer(org.apache.lucene.search.Scorer scorer) {
      // Do Nothing
    }
  });

  // persist the data
  TokenStream ts = analyzer.tokenStream(data.targetField, new StringReader("xxx"));
  ts.reset();
  ts.reset();
  ts.reset();
}
From source file:org.apache.solr.highlight.DefaultSolrHighlighter.java
License:Apache License
private void doHighlightingByHighlighter(Query query, SolrQueryRequest req, NamedList docSummaries, int docId,
    Document doc, String fieldName) throws IOException {
  final SolrIndexSearcher searcher = req.getSearcher();
  final IndexSchema schema = searcher.getSchema();

  // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
  // so we disable them until fixed (see LUCENE-3080)!
  // BEGIN: Hack
  final SchemaField schemaField = schema.getFieldOrNull(fieldName);
  if (schemaField != null && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField)
      || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField)))
    return;
  // END: Hack

  SolrParams params = req.getParams();

  // preserve order of values in a multiValued list
  boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);

  List<IndexableField> allFields = doc.getFields();
  if (allFields != null && allFields.size() == 0)
    return; // No explicit contract that getFields returns != null,
            // although currently it can't.

  TokenStream tstream = null;
  int numFragments = getMaxSnippets(fieldName, params);
  boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

  String[] summaries = null;
  List<TextFragment> frags = new ArrayList<TextFragment>();

  TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
  TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
  if (tvStream != null) {
    tots = new TermOffsetsTokenStream(tvStream);
  }
  int mvToExamine = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
      Integer.toString(Integer.MAX_VALUE)));
  int mvToMatch = Integer.parseInt(
      req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.toString(Integer.MAX_VALUE)));

  for (IndexableField thisField : allFields) {
    if (mvToExamine <= 0 || mvToMatch <= 0)
      break;

    if (!thisField.name().equals(fieldName))
      continue; // Is there a better way to do this?

    --mvToExamine;
    String thisText = thisField.stringValue();

    if (tots != null) {
      // if we're using TermOffsets optimization, then get the next
      // field value's TokenStream (i.e. get field j's TokenStream) from tots:
      tstream = tots.getMultiValuedTokenStream(thisText.length());
    } else {
      // fall back to analyzer
      tstream = createAnalyzerTStream(schema, fieldName, thisText);
    }

    int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS,
        Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

    Highlighter highlighter;
    if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
      if (maxCharsToAnalyze < 0) {
        tstream = new CachingTokenFilter(tstream);
      } else {
        tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
      }

      // get highlighter
      highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

      // after highlighter initialization, reset tstream since construction of highlighter already used it
      tstream.reset();
    } else {
      // use "the old way"
      highlighter = getHighlighter(query, fieldName, req);
    }

    if (maxCharsToAnalyze < 0) {
      highlighter.setMaxDocCharsToAnalyze(thisText.length());
    } else {
      highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
    }

    try {
      TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, thisText,
          mergeContiguousFragments, numFragments);
      for (int k = 0; k < bestTextFragments.length; k++) {
        if (preserveMulti) {
          if (bestTextFragments[k] != null) {
            frags.add(bestTextFragments[k]);
            --mvToMatch;
          }
        } else {
          if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
            frags.add(bestTextFragments[k]);
            --mvToMatch;
          }
        }
      }
    } catch (InvalidTokenOffsetsException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    }
  }
  // sort such that the fragments with the highest score come first
  if (!preserveMulti) {
    Collections.sort(frags, new Comparator<TextFragment>() {
      @Override
      public int compare(TextFragment arg0, TextFragment arg1) {
        return Math.round(arg1.getScore() - arg0.getScore());
      }
    });
  }

  // convert fragments back into text
  // TODO: we can include score and position information in output as snippet attributes
  if (frags.size() > 0) {
    ArrayList<String> fragTexts = new ArrayList<String>();
    for (TextFragment fragment : frags) {
      if (preserveMulti) {
        if (fragment != null) {
          fragTexts.add(fragment.toString());
        }
      } else {
        if ((fragment != null) && (fragment.getScore() > 0)) {
          fragTexts.add(fragment.toString());
        }
      }
      if (fragTexts.size() >= numFragments && !preserveMulti)
        break;
    }
    summaries = fragTexts.toArray(new String[0]);
    if (summaries.length > 0)
      docSummaries.add(fieldName, summaries);
  }
  // no summeries made, copy text from alternate field
  if (summaries == null || summaries.length == 0) {
    alternateField(docSummaries, params, doc, fieldName);
  }
}
From source file:org.apache.solr.highlight.DefaultSolrHighlighter.java
License:Apache License
private TokenStream createAnalyzerTStream(IndexSchema schema, String fieldName, String docText) throws IOException {
  TokenStream tstream;
  TokenStream ts = schema.getAnalyzer().tokenStream(fieldName, docText);
  ts.reset();
  tstream = new TokenOrderingFilter(ts, 10);
  return tstream;
}
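The example above wraps the analyzer's stream in a TokenFilter (TokenOrderingFilter is a Solr-internal helper). As a more general sketch of the same wrapping idea, built by hand and driven from the outermost stream, where reset() propagates down the chain: this assumes a Lucene 5.x/6.x-style API (the examples above are mostly 4.x-era, and package and constructor details vary between major Lucene versions), and the method name is illustrative.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Build a small chain by hand: WhitespaceTokenizer -> LowerCaseFilter.
// reset() is called once on the outermost stream and propagates to the tokenizer.
static void printLowercasedTokens(String text) throws IOException {
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(text));
  try (TokenStream chain = new LowerCaseFilter(tokenizer)) {
    CharTermAttribute termAtt = chain.addAttribute(CharTermAttribute.class);
    chain.reset();
    while (chain.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    chain.end();
  } // closing the outer filter also closes the wrapped tokenizer
}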