List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
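Before the project-specific examples, here is a minimal, self-contained sketch of the consumption contract every incrementToken() caller must follow: reset() before the first call, end() after the loop, close() when done. This is an illustration, not taken from any of the sources below; WhitespaceAnalyzer, the field name "body", and the sample text are placeholders, and older Lucene versions (as in several examples below) require a Version argument in the analyzer constructor.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        // field name and text are arbitrary placeholders
        TokenStream ts = analyzer.tokenStream("body", "hello token stream world");
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                    // mandatory before the first incrementToken()
            while (ts.incrementToken()) {  // false signals the end of the stream
                System.out.println(termAtt.toString());
            }
            ts.end();                      // records the final offset state
        } finally {
            ts.close();                    // releases resources so the analyzer can be reused
        }
    }
}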
From source file:org.apache.solr.handler.DumpIndexField.java
License:Apache License
private void runSynchronously(RequestQueue queue, SolrQueryRequest req)
        throws MalformedURLException, IOException, InterruptedException {
    SolrCore core = req.getCore();
    IndexSchema schema = req.getSchema();
    RequestData data = queue.pop();

    if (!allowed.matcher(data.sourceField).matches()) {
        data.msg("Export of this field is not allowed: " + data.sourceField);
        queue.registerFailedBatch(data);
        return;
    }
    SchemaField field = core.getSchema().getFieldOrNull(data.sourceField);

    if (field == null || !field.stored()) {
        data.msg("We cannot dump fields that are not stored: " + data.sourceField);
        queue.registerFailedBatch(data);
        return;
    }

    final Analyzer analyzer = core.getSchema().getQueryAnalyzer();
    SchemaField targetField = core.getSchema().getFieldOrNull(data.targetField);

    if (targetField == null) {
        data.msg("We cannot find analyzer for: " + data.targetField);
        queue.registerFailedBatch(data);
        return;
    }

    final String targetAnalyzer = data.targetField;
    DirectoryReader ir = req.getSearcher().getIndexReader();
    SolrIndexSearcher se = req.getSearcher();
    final HashSet<String> fieldsToLoad = new HashSet<String>();
    fieldsToLoad.add(data.sourceField);

    se.search(new MatchAllDocsQuery(), new Collector() {
        private AtomicReader reader;
        private int i = 0;

        @Override
        public boolean acceptsDocsOutOfOrder() {
            return true;
        }

        @Override
        public void collect(int i) {
            Document d;
            try {
                d = reader.document(i, fieldsToLoad);
                for (String f : fieldsToLoad) {
                    String[] vals = d.getValues(f);
                    for (String s : vals) {
                        // run each stored value through the target analyzer, discarding the tokens
                        TokenStream ts = analyzer.tokenStream(targetAnalyzer, new StringReader(s));
                        ts.reset();
                        while (ts.incrementToken()) {
                            // pass
                        }
                    }
                }
            } catch (IOException e) {
                // pass
            }
        }

        @Override
        public void setNextReader(AtomicReaderContext context) {
            this.reader = context.reader();
        }

        @Override
        public void setScorer(org.apache.lucene.search.Scorer scorer) {
            // Do Nothing
        }
    });

    // persist the data
    TokenStream ts = analyzer.tokenStream(data.targetField, new StringReader("xxx"));
    ts.reset();
}
From source file:org.apache.solr.highlight.HighlighterTest.java
License:Apache License
@Test
public void testTermOffsetsTokenStream() throws Exception {
    String[] multivalued = { "a b c d", "e f g", "h", "i j k l m n" };
    Analyzer a1 = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    TokenStream tokenStream = a1.tokenStream("", "a b c d e f g h i j k l m n");
    tokenStream.reset();
    TermOffsetsTokenStream tots = new TermOffsetsTokenStream(tokenStream);
    for (String v : multivalued) {
        TokenStream ts1 = tots.getMultiValuedTokenStream(v.length());
        Analyzer a2 = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
        TokenStream ts2 = a2.tokenStream("", v);
        ts2.reset();
        while (ts1.incrementToken()) {
            assertTrue(ts2.incrementToken());
            // relies on AttributeSource#equals, which compares the streams' current attribute state
            assertEquals(ts1, ts2);
        }
        assertFalse(ts2.incrementToken());
    }
}
From source file:org.apache.solr.legacy.TestLegacyFieldReuse.java
License:Apache License
private void assertNumericContents(int value, TokenStream ts) throws IOException {
    assertTrue(ts instanceof LegacyNumericTokenStream);
    LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
    ts.reset();
    boolean seen = false;
    while (ts.incrementToken()) {
        if (numericAtt.getShift() == 0) {
            assertEquals(value, numericAtt.getRawValue());
            seen = true;
        }
    }
    ts.end();
    ts.close();
    assertTrue(seen);
}
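The helper above checks that a LegacyNumericTokenStream emits the original value at shift 0. For context, a hedged sketch of producing and draining such a stream; it assumes the org.apache.solr.legacy package this test lives in and that setIntValue()/getShift()/getRawValue() keep the signatures of the old Lucene NumericTokenStream API:

import java.io.IOException;
import org.apache.solr.legacy.LegacyNumericTokenStream;
import org.apache.solr.legacy.LegacyNumericTokenStream.LegacyNumericTermAttribute;

public class NumericStreamSketch {
    public static void main(String[] args) throws IOException {
        // each incrementToken() emits the value at a coarser precision (larger shift)
        LegacyNumericTokenStream ts = new LegacyNumericTokenStream();
        ts.setIntValue(42);
        LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("shift=" + numericAtt.getShift() + " raw=" + numericAtt.getRawValue());
        }
        ts.end();
        ts.close();
    }
}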
From source file:org.apache.solr.schema.CollationField.java
License:Apache License
/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because JDK collators might not be thread safe (when they are,
 * it's just that all methods are synced), this keeps things
 * simple (we already have a threadlocal clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}
From source file:org.apache.solr.schema.EntityTextField.java
License:Apache License
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file:org.apache.solr.schema.ICUCollationField.java
License:Apache License
/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because ICU collators are not thread safe, this keeps things
 * simple (we already have a threadlocal clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}
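The three range/multi-term helpers above (CollationField, EntityTextField, ICUCollationField) all depend on the same invariant: the analyzer must emit exactly one token per input value, so the first incrementToken() must return true and the second false. A minimal sketch of that invariant using KeywordAnalyzer, which emits its entire input as a single token; the field name and input are placeholders:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SingleTokenContract {
    public static void main(String[] args) throws IOException {
        KeywordAnalyzer analyzer = new KeywordAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("f", "entire value as one token")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            if (!ts.incrementToken()) {
                throw new IllegalArgumentException("analyzer returned no terms");
            }
            String term = termAtt.toString();   // capture before advancing the stream
            if (ts.incrementToken()) {
                throw new IllegalArgumentException("analyzer returned too many terms");
            }
            ts.end();
            System.out.println(term);
        }
    }
}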
From source file:org.apache.solr.schema.JsonPreAnalyzedParser.java
License:Apache License
@Override
public String toFormattedString(Field f) throws IOException {
    Map<String, Object> map = new LinkedHashMap<String, Object>();
    map.put(VERSION_KEY, VERSION);
    if (f.fieldType().stored()) {
        String stringValue = f.stringValue();
        if (stringValue != null) {
            map.put(STRING_KEY, stringValue);
        }
        BytesRef binaryValue = f.binaryValue();
        if (binaryValue != null) {
            map.put(BINARY_KEY,
                    Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        List<Map<String, Object>> tokens = new LinkedList<Map<String, Object>>();
        while (ts.incrementToken()) {
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            Map<String, Object> tok = new TreeMap<String, Object>();
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                if (!ts.hasAttribute(cl)) {
                    continue;
                }
                Attribute att = ts.getAttribute(cl);
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = new String(catt.buffer(), 0, catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    tTerm = tatt.getBytesRef().utf8ToString();
                } else {
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
                        tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.put(TYPE_KEY, ((TypeAttribute) att).type());
                    } else {
                        tok.put(cl.getName(), att.toString());
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                tok.put(TOKEN_KEY, term);
            }
            tokens.add(tok);
        }
        map.put(TOKENS_KEY, tokens);
    }
    return JSONUtil.toJSON(map, -1);
}
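For reference, the JSON this method produces has roughly the shape shown below. This is an illustrative example assuming the key constants Solr's JsonPreAnalyzedParser conventionally uses (v, str, tokens, t, s, e, i, y); payload, flags, and binary entries appear only when those attributes are present:

{"v":"1","str":"one two","tokens":[{"t":"one","s":0,"e":3,"i":1,"y":"word"},{"t":"two","s":4,"e":7,"i":1,"y":"word"}]}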
From source file:org.apache.solr.schema.SimplePreAnalyzedParser.java
License:Apache License
@Override
public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
        String s = f.stringValue();
        if (s != null) {
            // encode the equals sign
            s = s.replaceAll("=", "\\\\=");
            sb.append('=');
            sb.append(s);
            sb.append('=');
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        StringBuilder tok = new StringBuilder();
        boolean next = false;
        while (ts.incrementToken()) {
            if (next) {
                sb.append(' ');
            } else {
                next = true;
            }
            tok.setLength(0);
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                if (!ts.hasAttribute(cl)) {
                    continue;
                }
                Attribute att = ts.getAttribute(cl);
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = escape(catt.buffer(), catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
                    tTerm = escape(tTermChars, tTermChars.length);
                } else {
                    if (tok.length() > 0)
                        tok.append(',');
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e="
                                + ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
                        } else if (tok.length() > 0) {
                            tok.setLength(tok.length() - 1); // remove the last comma
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.append("y=" + escape(((TypeAttribute) att).type()));
                    } else {
                        tok.append(cl.getName() + "=" + escape(att.toString()));
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                if (tok.length() > 0) {
                    tok.insert(0, term + ",");
                } else {
                    tok.insert(0, term);
                }
            }
            sb.append(tok);
        }
    }
    return sb.toString();
}
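Again for reference, the plain-text format built above looks roughly like this (illustrative; the version prefix, the optional =stored value= block with escaped equals signs, and the per-token attribute keys f, s, e, p, i, y follow directly from the code):

1 =stored \= value= one,s=0,e=3,i=1,y=word two,s=4,e=7,i=1,y=word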
From source file:org.apache.solr.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(getString()));

    SortedMap<Integer, SortedSet<TextInQuery>> startPosToTextsInQuery =
            new TreeMap<Integer, SortedSet<TextInQuery>>();

    try {
        tokenStream.reset(); // required before the first incrementToken() in recent Lucene versions
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                TextInQuery textInQuery = new TextInQuery(term.toString(),
                        offsetAttribute.startOffset(), offsetAttribute.endOffset());

                // brain-dead multimap logic... man, I wish we had Google Guava here
                SortedSet<TextInQuery> existingList = startPosToTextsInQuery.get(offsetAttribute.startOffset());
                if (existingList == null) {
                    existingList = new TreeSet<TextInQuery>();
                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), existingList);
                }
                existingList.add(textInQuery);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    }

    List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<List<TextInQuery>>(
            startPosToTextsInQuery.values().size());
    for (SortedSet<TextInQuery> sortedSet : startPosToTextsInQuery.values()) {
        sortedTextsInQuery.add(new ArrayList<TextInQuery>(sortedSet));
    }

    // have to use the start positions and end positions to figure out all possible combinations
    List<String> alternateQueries = buildUpAlternateQueries(sortedTextsInQuery);

    return createSynonymQueries(solrParams, alternateQueries);
}
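The "brain-dead multimap logic" the comment laments is exactly what a multimap abstracts away. A hedged sketch of the same bookkeeping with Guava's TreeMultimap, assuming that dependency is available; TextInQuery is stubbed here because the real class lives in the plugin:

import com.google.common.collect.TreeMultimap;

public class MultimapSketch {
    // stand-in for the plugin's TextInQuery; it must be Comparable to live in a sorted collection
    static class TextInQuery implements Comparable<TextInQuery> {
        final String text; final int start; final int end;
        TextInQuery(String text, int start, int end) { this.text = text; this.start = start; this.end = end; }
        public int compareTo(TextInQuery o) { return text.compareTo(o.text); }
        public String toString() { return text + "[" + start + "," + end + "]"; }
    }

    public static void main(String[] args) {
        // one line replaces the get-or-create-then-add dance above
        TreeMultimap<Integer, TextInQuery> byStart = TreeMultimap.create();
        byStart.put(0, new TextInQuery("dog", 0, 3));
        byStart.put(0, new TextInQuery("canine", 0, 3));
        byStart.put(4, new TextInQuery("bites", 4, 9));
        System.out.println(byStart); // {0=[canine[0,3], dog[0,3]], 4=[bites[4,9]]}
    }
}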
From source file:org.apache.solr.spelling.SimpleQueryConverter.java
License:Apache License
@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> result = new HashSet<Token>();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("", origQuery);
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();

        while (ts.incrementToken()) {
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}