List of usage examples for org.apache.lucene.analysis.TokenStream.getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. The method returns the instance of the requested attribute if this stream contains it, and throws an IllegalArgumentException otherwise; use hasAttribute(Class) to test for presence, or addAttribute(Class) to create the attribute on demand.
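Before the individual examples, here is a minimal, self-contained sketch of the typical consume loop. The StandardAnalyzer, the field name "body", and the class name are illustrative assumptions, not mandated by the API:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        // StandardAnalyzer is an arbitrary choice here; any Analyzer works the same way
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("body", "Hello attribute world");
            // getAttribute throws if the attribute is absent; addAttribute would add it instead
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset();                  // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                    // records final offset state
            ts.close();                  // releases the stream for reuse
        }
    }
}

Lucene's documentation recommends addAttribute(Class) in generic consumers, since it never throws; getAttribute is appropriate when the attribute is known to exist, as in the examples below.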
From source file: org.apache.solr.legacy.TestLegacyFieldReuse.java
License: Apache License
private void assertNumericContents(int value, TokenStream ts) throws IOException {
    assertTrue(ts instanceof LegacyNumericTokenStream);
    LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
    ts.reset();
    boolean seen = false;
    while (ts.incrementToken()) {
        if (numericAtt.getShift() == 0) {
            assertEquals(value, numericAtt.getRawValue());
            seen = true;
        }
    }
    ts.end();
    ts.close();
    assertTrue(seen);
}
From source file: org.apache.solr.schema.CollationField.java
License: Apache License
/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because JDK collators might not be thread safe (when they are,
 * it is only because all methods are synchronized), this keeps
 * things simple (we already have a threadlocal clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}
From source file: org.apache.solr.schema.EntityTextField.java
License: Apache License
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;
    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file: org.apache.solr.schema.ICUCollationField.java
License: Apache License
/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because ICU collators are not thread safe, this keeps things
 * simple (we already have a threadlocal clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}
From source file: org.apache.solr.schema.JsonPreAnalyzedParser.java
License: Apache License
@Override
public String toFormattedString(Field f) throws IOException {
    Map<String, Object> map = new LinkedHashMap<String, Object>();
    map.put(VERSION_KEY, VERSION);
    if (f.fieldType().stored()) {
        String stringValue = f.stringValue();
        if (stringValue != null) {
            map.put(STRING_KEY, stringValue);
        }
        BytesRef binaryValue = f.binaryValue();
        if (binaryValue != null) {
            map.put(BINARY_KEY,
                    Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        List<Map<String, Object>> tokens = new LinkedList<Map<String, Object>>();
        while (ts.incrementToken()) {
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            Map<String, Object> tok = new TreeMap<String, Object>();
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                if (!ts.hasAttribute(cl)) {
                    continue;
                }
                Attribute att = ts.getAttribute(cl);
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = new String(catt.buffer(), 0, catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    tTerm = tatt.getBytesRef().utf8ToString();
                } else {
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
                        tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.put(TYPE_KEY, ((TypeAttribute) att).type());
                    } else {
                        tok.put(cl.getName(), att.toString());
                    }
                }
            }
            String term = (cTerm != null) ? cTerm : tTerm;
            if (term != null && term.length() > 0) {
                tok.put(TOKEN_KEY, term);
            }
            tokens.add(tok);
        }
        map.put(TOKENS_KEY, tokens);
    }
    return JSONUtil.toJSON(map, -1);
}
From source file: org.apache.solr.schema.SimplePreAnalyzedParser.java
License: Apache License
@Override
public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
        String s = f.stringValue();
        if (s != null) {
            // encode the equals sign
            s = s.replaceAll("=", "\\\\=");
            sb.append('=');
            sb.append(s);
            sb.append('=');
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        StringBuilder tok = new StringBuilder();
        boolean next = false;
        while (ts.incrementToken()) {
            if (next) {
                sb.append(' ');
            } else {
                next = true;
            }
            tok.setLength(0);
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                if (!ts.hasAttribute(cl)) {
                    continue;
                }
                Attribute att = ts.getAttribute(cl);
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = escape(catt.buffer(), catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
                    tTerm = escape(tTermChars, tTermChars.length);
                } else {
                    if (tok.length() > 0)
                        tok.append(',');
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.append("s=" + ((OffsetAttribute) att).startOffset()
                                + ",e=" + ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
                        } else if (tok.length() > 0) {
                            tok.setLength(tok.length() - 1); // remove the last comma
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.append("y=" + escape(((TypeAttribute) att).type()));
                    } else {
                        tok.append(cl.getName() + "=" + escape(att.toString()));
                    }
                }
            }
            String term = (cTerm != null) ? cTerm : tTerm;
            if (term != null && term.length() > 0) {
                if (tok.length() > 0) {
                    tok.insert(0, term + ",");
                } else {
                    tok.insert(0, term);
                }
            }
            sb.append(tok);
        }
    }
    return sb.toString();
}
From source file: org.apache.solr.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License: Apache License
/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 *
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {
    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(getString()));
    SortedMap<Integer, SortedSet<TextInQuery>> startPosToTextsInQuery =
            new TreeMap<Integer, SortedSet<TextInQuery>>();
    try {
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well
                TextInQuery textInQuery = new TextInQuery(term.toString(),
                        offsetAttribute.startOffset(), offsetAttribute.endOffset());

                // brain-dead multimap logic... man, I wish we had Google Guava here
                SortedSet<TextInQuery> existingList = startPosToTextsInQuery.get(offsetAttribute.startOffset());
                if (existingList == null) {
                    existingList = new TreeSet<TextInQuery>();
                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), existingList);
                }
                existingList.add(textInQuery);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    }

    List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<List<TextInQuery>>(
            startPosToTextsInQuery.values().size());
    for (SortedSet<TextInQuery> sortedSet : startPosToTextsInQuery.values()) {
        sortedTextsInQuery.add(new ArrayList<TextInQuery>(sortedSet));
    }

    // have to use the start positions and end positions to figure out all possible combinations
    List<String> alternateQueries = buildUpAlternateQueries(sortedTextsInQuery);

    return createSynonymQueries(solrParams, alternateQueries);
}
From source file: org.apache.tika.eval.AnalyzerManagerTest.java
License: Apache License
@Test
public void testGeneral() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer general = analyzerManager.getGeneralAnalyzer();
    TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("the"));
    assertTrue(seen.contains("and"));
    assertTrue(seen.contains("dog"));
}
From source file: org.apache.tika.eval.AnalyzerManagerTest.java
License: Apache License
@Test
public void testCommon() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer common = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
            fail("Shouldn't have found a numeric");
        }
        seen.add(t);
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("dirty"));
    assertFalse(seen.contains("the"));
}
From source file: org.apache.tika.eval.AnalyzerManagerTest.java
License: Apache License
@Test
public void testTokenCountFilter() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 101000; i++) {
        sb.append("the ");
    }
    TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
    ts.reset();
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
    }
    ts.end();
    ts.close();
    // the general analyzer's token-count filter caps the stream at 100,000 tokens
    assertEquals(100000, tokens);
}