List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. getAttribute returns the instance of the requested attribute type contained in the stream's AttributeSource; if the stream does not contain that attribute, an IllegalArgumentException is thrown.
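Before the source-file examples below, here is a minimal, self-contained sketch of the usual call pattern, assuming a recent Lucene with StandardAnalyzer on the classpath (the field name "body" and the sample text are illustrative, not taken from any example below): obtain the attribute once before the loop, then reset(), iterate with incrementToken(), and finish with end(). Where the attribute may be absent, hasAttribute(...) or addAttribute(...) is the safer choice, since getAttribute throws when it is missing.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources closes the TokenStream for us
        try (TokenStream ts = analyzer.tokenStream("body", "some example text")) {
            // getAttribute throws IllegalArgumentException if the attribute
            // is absent; StandardAnalyzer streams always carry CharTermAttribute
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();
        }
    }
}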
From source file:org.sindice.siren.analysis.filter.URINormalisationFilter.java
License:Apache License
/** For testing purposes. */
public static void main(final String[] args) throws IOException {
    final TupleTokenizer stream = new TupleTokenizer(
        new StringReader("" + "<mailto:renaud.delbru@deri.org> <http://renaud.delbru.fr/rdf/foaf> "
            + "<http://renaud.delbru.fr/> <http://xmlns.com/foaf/0.1/workplaceHomepage/>"),
        Integer.MAX_VALUE, new WhitespaceAnalyzer(Version.LUCENE_31));
    final TokenStream result = new URINormalisationFilter(stream);
    while (result.incrementToken()) {
        final CharTermAttribute termAtt = result.getAttribute(CharTermAttribute.class);
        final PositionIncrementAttribute posIncrAtt = result.getAttribute(PositionIncrementAttribute.class);
        System.out.println(termAtt.toString() + ", " + posIncrAtt.getPositionIncrement());
    }
}
From source file:org.sindice.siren.analysis.TestTupleAnalyzer.java
License:Apache License
public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final int[] expectedTupleID,
        final int[] expectedCellID) throws Exception {
    final TokenStream t = a.reusableTokenStream("", new StringReader(input));

    assertTrue("has TermAttribute", t.hasAttribute(TermAttribute.class));
    final TermAttribute termAtt = t.getAttribute(TermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    TupleAttribute tupleAtt = null;
    if (expectedTupleID != null) {
        assertTrue("has TupleAttribute", t.hasAttribute(TupleAttribute.class));
        tupleAtt = t.getAttribute(TupleAttribute.class);
    }

    CellAttribute cellAtt = null;
    if (expectedCellID != null) {
        assertTrue("has CellAttribute", t.hasAttribute(CellAttribute.class));
        cellAtt = t.getAttribute(CellAttribute.class);
    }

    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", t.incrementToken());
        assertEquals(expectedImages[i], termAtt.term());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }
        if (expectedTupleID != null) {
            assertEquals(expectedTupleID[i], tupleAtt.tuple());
        }
        if (expectedCellID != null) {
            assertEquals(expectedCellID[i], cellAtt.cell());
        }
    }

    assertFalse("end of stream", t.incrementToken());
    t.end();
    t.close();
}
From source file:org.sindice.siren.qparser.analysis.filter.QNamesFilter.java
License:Apache License
public QNamesFilter(final TokenStream input, final String path) {
    super(input);
    cTermAtt = input.getAttribute(CharTermAttribute.class);
    try {
        qnames.load(new FileInputStream(path));
    } catch (final FileNotFoundException e) {
        logger.error("QNames mapping file not found", e);
        throw new RuntimeException("QNames mapping file not found", e);
    } catch (final IOException e) {
        logger.error("Parsing of the QNames mapping file failed", e);
        throw new RuntimeException("Parsing of the QNames mapping file failed", e);
    }
    logger.debug("Loading QNames mapping file located at {}", path);
}
From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java
License:Apache License
private void processFieldEntry(String fieldName, String s, CharArraySet set) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, s);
    CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        set.add(cattr.toString());
    }
    ts.end();
    ts.close();
}
From source file:org.tallison.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader.java
License:Apache License
private int addFieldValue(String fieldName, int currInd, int charBase, String fieldValue,
        TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results) throws IOException {
    //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true);
    TokenStream stream = baseAnalyzer.tokenStream(fieldName, fieldValue);
    stream.reset();
    int defaultInc = 1;
    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    PositionIncrementAttribute incAtt = null;
    if (stream.hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) {
        incAtt = stream
                .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
    }
    while (stream.incrementToken()) {
        //Do we need this?
        if (incAtt != null && incAtt.getPositionIncrement() == 0) {
            continue;
        }
        currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc;
        if (requests.contains(currInd)) {
            results.add(currInd, offsetAtt.startOffset() + charBase, offsetAtt.endOffset() + charBase,
                    termAtt.toString());
        }
        if (currInd > requests.getLast()) {
            // TODO: Is there a way to avoid this? Or, is this
            // an imaginary performance hit?
            while (stream.incrementToken()) {
                //NO-OP
            }
            stream.end();
            stream.close();
            return GOT_ALL_REQUESTS;
        }
    }
    stream.end();
    stream.close();
    return currInd;
}
From source file:org.tallison.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil.java
License:Apache License
/**
 * Allows reuse of terms; this method calls terms.clear() before adding new terms.
 *
 * @param s        string to analyze
 * @param field    field to use in analysis
 * @param analyzer analyzer
 * @param terms    list for reuse
 * @return list of strings
 * @throws java.io.IOException if there's an IOException during analysis
 */
public static List<String> getTermStrings(String s, String field, Analyzer analyzer, List<String> terms)
        throws IOException {
    if (terms == null) {
        terms = new ArrayList<>();
    }
    terms.clear();
    TokenStream stream = analyzer.tokenStream(field, s);
    stream.reset();
    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    while (stream.incrementToken()) {
        terms.add(termAtt.toString());
    }
    stream.end();
    stream.close();
    return terms;
}
From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java
License:Apache License
@Test
public void testBasicNoUnigrams() throws Exception {
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, false);
    String s = "a b c d e f g";
    TokenStream tokenStream = analyzer.tokenStream(ConcordanceTestBase.FIELD, s);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
    List<String> expected = Arrays.asList(new String[] { "a_b", "b_c", "c_d", "d_e", "e_f", "f_g", });
    List<String> returned = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        String token = charTermAttribute.toString();
        assertEquals(1, posIncAttribute.getPositionIncrement());
        returned.add(token);
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(expected, returned);
}
From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java
License:Apache License
@Test
public void testIncludeUnigrams() throws Exception {
    List<String> expected = Arrays.asList(new String[] { "a", "a_b", "b", "b_c", "c", "c_d", "d", "d_e",
            "e", "e_f", "f", "f_g", "g", });
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, true);
    String s = "a b c d e f g";
    TokenStream tokenStream = analyzer.tokenStream("f", s);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
    List<String> returned = new ArrayList<>();
    int i = 0;
    while (tokenStream.incrementToken()) {
        String token = charTermAttribute.toString();
        // unigrams and bigrams alternate: every other token shares a position
        if (i++ % 2 == 0) {
            assertEquals(1, posIncAttribute.getPositionIncrement());
        } else {
            assertEquals(0, posIncAttribute.getPositionIncrement());
        }
        returned.add(token);
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(expected, returned);
}
From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java
License:Apache License
@Test
public void testCJKNoUnigrams() throws Exception {
    final CharacterRunAutomaton stops = MockTokenFilter.EMPTY_STOPSET;
    int posIncGap = 10;
    final int charOffsetGap = 10;
    Analyzer analyzer = getCJKBigramAnalyzer(false);
    TokenStream ts = analyzer.tokenStream(FIELD, "");
    ts.reset();
    CharTermAttribute charTermAttribute = ts.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = ts.getAttribute(PositionIncrementAttribute.class);
    ts.end();
    ts.close();
    String[] docs = new String[] { "" };
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD)));
    Query q = new TermQuery(new Term(FIELD, ""));
    //now test straight and span wrapper
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher, FIELD, q, q, analyzer, collector);
    for (ConcordanceWindow w : collector.getWindows()) {
        //System.out.println(w);
    }
    reader.close();
    directory.close();
}
From source file:org.usergrid.utils.IndexUtils.java
License:Apache License
public static List<String> keywords(String source) {
    TokenStream ts = analyzer.tokenStream("keywords", new StringReader(source));
    List<String> keywords = new ArrayList<String>();
    try {
        while (ts.incrementToken()) {
            keywords.add(ts.getAttribute(TermAttribute.class).term());
        }
    } catch (IOException e) {
        logger.error("Error getting keywords ", e);
    }
    return keywords;
}