List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException

This method is called by the consumer after the last token has been consumed, after incrementToken() returned false (using the new TokenStream API). It can be used to perform any end-of-stream operations, such as setting the final offset of a stream.
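Every example on this page follows the same consumer sequence: reset(), a loop over incrementToken(), then end(), then close(). A minimal self-contained sketch of that sequence (assuming a recent Lucene version with the no-argument StandardAnalyzer constructor; the field name "body" and the sample text are arbitrary choices for illustration):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamEndExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources guarantees close() even if consumption fails
        try (TokenStream ts = analyzer.tokenStream("body", "hello token stream ")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            ts.reset();                // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                  // sets end-of-stream state, e.g. the final offset
            // After end(), endOffset() reflects the full input length (here 19,
            // including the trailing space that produced no token).
            System.out.println("final offset: " + offsetAtt.endOffset());
        }
    }
}

The real-world examples below follow the same sequence.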
From source file:org.apache.solr.handler.ClassifyStream.java
License:Apache License
@Override
public Tuple read() throws IOException {
    // Lazily read the model tuple on the first call.
    if (modelTuple == null) {
        modelTuple = modelStream.read();
        if (modelTuple == null || modelTuple.EOF) {
            throw new IOException("Model tuple not found for classify stream!");
        }
        termToIndex = new HashMap<>();
        List<String> terms = modelTuple.getStrings("terms_ss");
        for (int i = 0; i < terms.size(); i++) {
            termToIndex.put(terms.get(i), i);
        }
        idfs = modelTuple.getDoubles("idfs_ds");
        modelWeights = modelTuple.getDoubles("weights_ds");
    }

    Tuple docTuple = docStream.read();
    if (docTuple.EOF) {
        return docTuple;
    }

    // Count term frequencies over the document's analyzed text.
    String text = docTuple.getString(field);
    double[] tfs = new double[termToIndex.size()];
    TokenStream tokenStream = analyzer.tokenStream(analyzerField, text);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    int termCount = 0;
    while (tokenStream.incrementToken()) {
        termCount++;
        if (termToIndex.containsKey(termAtt.toString())) {
            tfs[termToIndex.get(termAtt.toString())]++;
        }
    }
    tokenStream.end();
    tokenStream.close();

    // Build the tf-idf vector (index 0 is the bias term) and score it
    // against the model weights.
    List<Double> tfidfs = new ArrayList<>(termToIndex.size());
    tfidfs.add(1.0);
    for (int i = 0; i < tfs.length; i++) {
        if (tfs[i] != 0) {
            tfs[i] = 1 + Math.log(tfs[i]);
        }
        tfidfs.add(this.idfs.get(i) * tfs[i]);
    }

    double total = 0.0;
    for (int i = 0; i < tfidfs.size(); i++) {
        total += tfidfs.get(i) * modelWeights.get(i);
    }

    double score = total * ((float) (1.0 / Math.sqrt(termCount)));
    double positiveProb = sigmoid(total);
    docTuple.put("probability_d", positiveProb);
    docTuple.put("score_d", score);
    return docTuple;
}
From source file:org.apache.solr.handler.component.QueryElevationComponent.java
License:Apache License
String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", query);
    try {
        tokens.reset();
        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        while (tokens.incrementToken()) {
            norm.append(termAtt.buffer(), 0, termAtt.length());
        }
        tokens.end();
        return norm.toString();
    } finally {
        IOUtils.closeWhileHandlingException(tokens);
    }
}
From source file:org.apache.solr.handler.component.SpellCheckComponent.java
License:Apache License
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
    Collection<Token> result = new ArrayList<Token>();
    assert analyzer != null;
    TokenStream ts = analyzer.tokenStream("", q);
    try {
        ts.reset();
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        while (ts.incrementToken()) {
            Token token = new Token();
            token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            token.setType(typeAtt.type());
            token.setFlags(flagsAtt.getFlags());
            token.setPayload(payloadAtt.getPayload());
            token.setPositionIncrement(posIncAtt.getPositionIncrement());
            result.add(token);
        }
        ts.end();
        return result;
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.apache.solr.legacy.TestLegacyFieldReuse.java
License:Apache License
private void assertNumericContents(int value, TokenStream ts) throws IOException {
    assertTrue(ts instanceof LegacyNumericTokenStream);
    LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
    ts.reset();
    boolean seen = false;
    while (ts.incrementToken()) {
        if (numericAtt.getShift() == 0) {
            assertEquals(value, numericAtt.getRawValue());
            seen = true;
        }
    }
    ts.end();
    ts.close();
    assertTrue(seen);
}
From source file:org.apache.solr.schema.CollationField.java
License:Apache License
/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because JDK collators might not be thread safe (and when they are,
 * it's only because all methods are synchronized), this keeps things
 * simple (we already have a thread-local clone in the reused TokenStream).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}
From source file:org.apache.solr.schema.EntityTextField.java
License:Apache License
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file:org.apache.solr.schema.ICUCollationField.java
License:Apache License
/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because ICU collators are not thread safe, this keeps things
 * simple (we already have a thread-local clone in the reused TokenStream).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}
From source file:org.apache.solr.spelling.SimpleQueryConverter.java
License:Apache License
@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> result = new HashSet<Token>();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("", origQuery);
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.apache.solr.spelling.SpellingQueryConverter.java
License:Apache License
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); // overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}
From source file:org.apache.solr.TestTrie.java
License:Apache License
@Test
public void testTokenizer() throws Exception {
    FieldType type = h.getCore().getLatestSchema().getFieldType("tint");
    assertTrue(type instanceof TrieField);

    String value = String.valueOf(random().nextInt());
    TokenStream ts = type.getAnalyzer().tokenStream("dummy", value);
    OffsetAttribute ofsAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
        count++;
        assertEquals(0, ofsAtt.startOffset());
        assertEquals(value.length(), ofsAtt.endOffset());
    }
    final int precStep = ((TrieField) type).getPrecisionStep();
    assertEquals((32 + precStep - 1) / precStep, count);
    ts.end();
    assertEquals(value.length(), ofsAtt.startOffset());
    assertEquals(value.length(), ofsAtt.endOffset());
    ts.close();

    // Test empty one:
    ts = type.getAnalyzer().tokenStream("dummy", "");
    ts.reset();
    assertFalse(ts.incrementToken());
    ts.end();
    assertEquals(0, ofsAtt.startOffset());
    assertEquals(0, ofsAtt.endOffset());
    ts.close();
}
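The examples above are all consumers of end(). On the producer side, any TokenFilter that needs end-of-stream behavior must override end() and call super.end() first, so that end-of-stream state (such as the final offset) propagates from upstream. A minimal hypothetical sketch of that contract (TokenCountingFilter is an invented name, not part of Lucene or Solr):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

// A trivial filter that counts tokens; the count is reliable only after end().
public final class TokenCountingFilter extends TokenFilter {
    private int count;

    public TokenCountingFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (input.incrementToken()) {
            count++;
            return true;
        }
        return false;
    }

    @Override
    public void end() throws IOException {
        super.end(); // propagates end-of-stream attribute state from upstream
        // any end-of-stream work goes here; attributes now hold their end() values
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        count = 0; // streams are reused, so per-stream state must be cleared
    }

    public int getCount() {
        return count;
    }
}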