Example usage for org.apache.lucene.analysis TokenStream TokenStream

List of usage examples for org.apache.lucene.analysis TokenStream TokenStream

Introduction

On this page you can find an example usage for org.apache.lucene.analysis TokenStream TokenStream.

Prototype

protected TokenStream() 

Source Link

Document

A TokenStream using the default attribute factory.

Usage

From source file:axiom.objectmodel.dom.ReferenceAnalyzer.java

License:Open Source License

/**
 * Builds a single-token stream: everything read from {@code reader} up to
 * (but not including) the first occurrence of the null delimiter becomes
 * the one emitted token.
 *
 * @param fieldName the field being analyzed (unused; required by the API)
 * @param reader    source of the field text
 * @return a TokenStream that emits at most one token
 */
public TokenStream tokenStream(String fieldName, final Reader reader) {
    return new TokenStream() {
        // True once the single token (or null) has been produced.
        private boolean done = false;
        // Instance (not static) field: a static member inside an anonymous
        // class only compiles pre-Java 16 when its initializer is a
        // compile-time constant, which a field reference may not be.
        private final String delim = LuceneManager.NULL_DELIM;

        /**
         * @return the text before the first delimiter, or null when the
         *         input contains no delimiter or the stream is exhausted
         */
        public Token next() throws IOException {
            if (done) {
                return null;
            }
            done = true;
            final char[] buffer = new char[512];
            // StringBuilder: no synchronization needed for a local buffer.
            StringBuilder sb = new StringBuilder();
            int length;
            // Read until the delimiter shows up or the reader is drained.
            while (sb.indexOf(delim) < 0 && (length = reader.read(buffer)) != -1) {
                sb.append(buffer, 0, length);
            }
            final String value = sb.toString();
            final int index = value.indexOf(delim);
            if (index < 0) {
                return null; // no delimiter anywhere in the input
            }
            final String text = value.substring(0, index);
            return new Token(text, 0, text.length());
        }
    };
}

From source file:com.duroty.lucene.analysis.KeywordAnalyzer.java

License:Apache License

/**
 * Builds a keyword-style stream: the entire reader contents are emitted
 * as a single token (no tokenization).
 *
 * @param fieldName the field being analyzed (unused; required by the API)
 * @param reader    source of the field text
 *
 * @return a TokenStream that emits exactly one token holding the full
 *         input (an empty token for empty input), then null
 */
public TokenStream tokenStream(String fieldName, final Reader reader) {
    return new TokenStream() {
        private boolean done;
        // Reusable read buffer. Renamed from "buffer" so it is no longer
        // shadowed by the StringBuffer local of the same name below.
        private final char[] chunk = new char[1024];

        public Token next() throws IOException {
            if (done) {
                return null;
            }
            done = true;

            // StringBuilder (unsynchronized) is sufficient for a local.
            StringBuilder text = new StringBuilder();
            int length;
            while ((length = reader.read(chunk)) != -1) {
                text.append(chunk, 0, length);
            }

            String value = text.toString();
            return new Token(value, 0, value.length());
        }
    };
}

From source file:com.shaie.PhraseVsSpanQuery.java

License:Apache License

@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    // Index one document whose field "f" contains two tokens, "a" and "b",
    // stacked at the SAME position ("b" has position increment 0), then
    // compare how PhraseQuery (at several slops) and SpanNearQuery
    // (ordered vs. unordered) match the pair.
    final Directory dir = new RAMDirectory();
    final IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());
    final IndexWriter writer = new IndexWriter(dir, conf);

    final Document doc = new Document();
    // Hand-rolled two-token stream instead of an analyzer, so the position
    // increments can be controlled exactly.
    doc.add(new TextField("f", new TokenStream() {
        final PositionIncrementAttribute pos = addAttribute(PositionIncrementAttribute.class);
        final CharTermAttribute term = addAttribute(CharTermAttribute.class);
        boolean first = true, done = false;

        @Override
        public boolean incrementToken() throws IOException {
            if (done) {
                return false;
            }
            if (first) {
                // First call: emit "a", advancing the position normally.
                term.setEmpty().append("a");
                pos.setPositionIncrement(1);
                first = false;
            } else {
                // Second call: emit "b" at the same position as "a".
                term.setEmpty().append("b");
                pos.setPositionIncrement(0);
                done = true;
            }
            return true;
        }
    }));
    writer.addDocument(doc);
    writer.close();

    final DirectoryReader reader = DirectoryReader.open(dir);
    final IndexSearcher searcher = new IndexSearcher(reader);
    // Dump every indexed term of "f" with its doc id and first position,
    // to show that "a" and "b" share a position.
    final LeafReader ar = reader.leaves().get(0).reader();
    final TermsEnum te = ar.terms("f").iterator();
    BytesRef scratch = new BytesRef();
    while ((scratch = te.next()) != null) {
        System.out.println(scratch.utf8ToString());
        final PostingsEnum dape = ar.postings(new Term("f", scratch.utf8ToString()));
        System.out.println("  doc=" + dape.nextDoc() + ", pos=" + dape.nextPosition());
    }

    System.out.println();

    // try a phrase query with a slop
    final PhraseQuery pqNoSlop = buildPhraseQuery(0);
    System.out.println("searching for \"a b\"; num results = " + searcher.search(pqNoSlop, 10).totalHits);

    final PhraseQuery pqSlop1 = buildPhraseQuery(1);
    System.out.println("searching for \"a b\"~1; num results = " + searcher.search(pqSlop1, 10).totalHits);

    final PhraseQuery pqSlop3 = buildPhraseQuery(3);
    System.out.println("searching for \"a b\"~3; num results = " + searcher.search(pqSlop3, 10).totalHits);

    // Same pair of terms as span-near queries, unordered then ordered.
    final SpanNearQuery snqUnOrdered = new SpanNearQuery(
            new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1,
            false);
    System.out.println("searching for SpanNearUnordered('a', 'b'), slop=1; num results = "
            + searcher.search(snqUnOrdered, 10).totalHits);

    final SpanNearQuery snqOrdered = new SpanNearQuery(
            new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1,
            true);
    System.out.println("searching for SpanNearOrdered('a', 'b'), slop=1; num results = "
            + searcher.search(snqOrdered, 10).totalHits);

    reader.close();
}

From source file:org.apache.solr.analysis.TestRemoveDuplicatesTokenFilter.java

License:Apache License

public void testDups(final String expected, final Token... tokens) throws Exception {

    // Wrap the canned tokens in a minimal TokenStream backed by an iterator.
    final Iterator<Token> source = Arrays.asList(tokens).iterator();
    final TokenStream input = new TokenStream() {
        public Token next() {
            if (source.hasNext()) {
                return source.next();
            }
            return null; // end of stream
        }
    };

    // Run the duplicates filter and compare its flattened output.
    final TokenStream filtered = new RemoveDuplicatesTokenFilter(input);
    final String actual = TestBufferedTokenStream.tsToString(filtered);
    assertEquals(expected + " != " + actual, expected, actual);

}

From source file:org.apache.solr.analysis.TestRemoveDuplicatesTokenFilterFactory.java

License:Apache License

public void testDups(final String expected, final Token... tokens) throws Exception {

    // Replay the canned tokens through an attribute-based TokenStream and
    // feed that into the factory-created duplicates filter.
    final Iterator<Token> source = Arrays.asList(tokens).iterator();
    RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory();
    final TokenStream ts = factory.create(new TokenStream() {
        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

        public boolean incrementToken() {
            if (!source.hasNext()) {
                return false; // stream exhausted
            }
            // Copy the next Token's state into this stream's attributes.
            clearAttributes();
            Token tok = source.next();
            termAtt.setEmpty().append(tok);
            offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
            posIncAtt.setPositionIncrement(tok.getPositionIncrement());
            return true;
        }
    });

    assertTokenStreamContents(ts, expected.split("\\s"));
}

From source file:org.apache.solr.analysis.TestSynonymFilter.java

License:Apache License

/**
 * Runs {@code input} through a SynonymFilter built over {@code dict} and
 * collects every emitted token. Tokens are cloned because the filter is
 * driven with a reusable target token.
 *
 * @param dict        the synonym map to apply
 * @param input       raw text, tokenized via the {@code tokens} helper
 * @param includeOrig unused here; kept for caller compatibility
 * @return all tokens produced by the filter, in order
 * @throws IOException if the underlying stream fails
 */
public List<Token> getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
    ArrayList<Token> lst = new ArrayList<Token>();
    // Parameterized generics replace the raw List/Iterator of the original
    // and make the (Token) element cast unnecessary.
    final List<Token> toks = tokens(input);
    TokenStream ts = new TokenStream() {
        Iterator<Token> iter = toks.iterator();

        @Override
        public Token next() throws IOException {
            return iter.hasNext() ? iter.next() : null;
        }
    };

    SynonymFilter sf = new SynonymFilter(ts, dict);

    Token target = new Token(); // test with token reuse
    while (true) {
        Token t = sf.next(target);
        if (t == null) {
            return lst;
        }
        // Clone: the filter may overwrite the reused target on the next call.
        lst.add((Token) t.clone());
    }
}

From source file:org.apache.solr.analysis.TestWordDelimiterFilter.java

License:Apache License

/***
public void testPerformance() throws IOException {
  String s = "now is the time-for all good men to come to-the aid of their country.";
  Token tok = new Token();
  long start = System.currentTimeMillis();
  int ret=0;
  for (int i=0; i<1000000; i++) {
    StringReader r = new StringReader(s);
    TokenStream ts = new WhitespaceTokenizer(r);
    ts = new WordDelimiterFilter(ts, 1,1,1,1,0);
        
    while (ts.next(tok) != null) ret++;
  }
        
  System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
}
***/

public void testOffsets() throws IOException {

    // Subwords and the catenated subword must carry offsets computed from
    // the original token's start/end offsets.
    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
        Token emitted;

        public Token next() throws IOException {
            if (emitted != null) {
                return null; // single-token stream
            }
            emitted = new Token("foo-bar", 5, 12); // actual
            return emitted;
        }
    }, 1, 1, 0, 0, 1, 1, 0);

    int matched = 0;
    for (Token t = wdf.next(); t != null; t = wdf.next()) {
        String termText = new String(t.termBuffer(), 0, t.termLength());
        if (termText.equals("foo")) {
            assertEquals(5, t.startOffset());
            assertEquals(8, t.endOffset());
            matched++;
        } else if (termText.equals("bar")) {
            assertEquals(9, t.startOffset());
            assertEquals(12, t.endOffset());
            matched++;
        } else if (termText.equals("foobar")) {
            assertEquals(5, t.startOffset());
            assertEquals(12, t.endOffset());
            matched++;
        }
    }
    assertEquals(3, matched); // make sure all 3 tokens were generated

    // test that if splitting or catenating a synonym, that the offsets
    // are not altered (they would be incorrect).
    wdf = new WordDelimiterFilter(new TokenStream() {
        Token emitted;

        public Token next() throws IOException {
            if (emitted != null) {
                return null;
            }
            emitted = new Token("foo-bar", 5, 6); // a synonym
            return emitted;
        }
    }, 1, 1, 0, 0, 1, 1, 0);
    for (Token t = wdf.next(); t != null; t = wdf.next()) {
        assertEquals(5, t.startOffset());
        assertEquals(6, t.endOffset());
    }
}

From source file:org.apache.solr.analysis.TestWordDelimiterFilter.java

License:Apache License

public void testOffsetChange() throws Exception {
    // Input token ends with ')' (offsets 7-16); the filter should emit
    // "belkeit" with the end offset shrunk by one to 15.
    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
        Token emitted;

        public Token next() {
            if (emitted == null) {
                emitted = new Token("belkeit)", 7, 16);
                return emitted;
            }
            return null; // single-token stream
        }
    }, 1, 1, 0, 0, 1, 1, 0);

    Token result = wdf.next();

    assertNotNull(result);
    assertEquals("belkeit", result.term());
    assertEquals(7, result.startOffset());
    assertEquals(15, result.endOffset());
}

From source file:org.apache.solr.analysis.TestWordDelimiterFilter.java

License:Apache License

public void testOffsetChange2() throws Exception {
    // Input token starts with '(' (offsets 7-17); the filter should emit
    // "belkeit" with the start offset advanced by one to 8.
    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
        Token emitted;

        public Token next() {
            if (emitted == null) {
                emitted = new Token("(belkeit", 7, 17);
                return emitted;
            }
            return null; // single-token stream
        }
    }, 1, 1, 0, 0, 1, 1, 0);

    Token result = wdf.next();

    assertNotNull(result);
    assertEquals("belkeit", result.term());
    assertEquals(8, result.startOffset());
    assertEquals(17, result.endOffset());
}

From source file:org.apache.solr.analysis.TestWordDelimiterFilter.java

License:Apache License

public void testOffsetChange3() throws Exception {
    // Leading '(' with offsets 7-16: start offset advances to 8 while the
    // end offset stays at 16.
    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
        Token emitted;

        public Token next() {
            if (emitted == null) {
                emitted = new Token("(belkeit", 7, 16);
                return emitted;
            }
            return null; // single-token stream
        }
    }, 1, 1, 0, 0, 1, 1, 0);

    Token result = wdf.next();

    assertNotNull(result);
    assertEquals("belkeit", result.term());
    assertEquals(8, result.startOffset());
    assertEquals(16, result.endOffset());
}