Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

This page collects usage examples for the addAttribute method of org.apache.lucene.analysis.TokenStream.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Source Link

Document

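The caller must pass in a Class<? extends Attribute> value. The method first checks whether an instance of that class is already present in this AttributeSource and returns it; otherwise a new instance is created, added to the AttributeSource, and returned.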

Usage

From source file:nl.b3p.viewer.stripes.CatalogSearchActionBean.java

License:Open Source License

/**
 * Builds an OR filter for the given query string against the given property.
 *
 * <p>The filter always contains an "is equal to" clause for the complete (preprocessed) query
 * string, plus one "is like" clause per token that a {@link StandardAnalyzer} configured with
 * the Dutch default stop set produces for it. If tokenizing fails with an {@link IOException},
 * an "is like" clause for the complete query string is added as a fallback; clauses already
 * collected before the failure are kept.
 *
 * <p>When the preprocessed query string is {@code null} or equals {@code defaultWildCard}
 * after trimming, the returned OR filter has no clauses.
 *
 * @param queryString  the raw query; passed through {@code createQueryString(queryString, false)}
 * @param propertyName the property to filter on; passed through {@code createPropertyName}
 * @return an {@code Or} wrapping all collected clauses
 */
private static Or createOrFilter(String queryString, String propertyName) {
    // Raw list kept on purpose: the element type expected by BinaryLogicOpType is declared
    // outside this snippet, so it cannot be parameterized safely from here.
    List orList = new ArrayList();
    queryString = createQueryString(queryString, false);
    if (queryString != null && !queryString.trim().equals(defaultWildCard)) {

        propertyName = createPropertyName(propertyName);

        // The exact match on the whole query string always takes part in the OR.
        orList.add(FilterCreator.createPropertyIsEqualTo(queryString, propertyName));

        StandardAnalyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_45,
                DutchAnalyzer.getDefaultStopSet());
        try {
            TokenStream tokenStream = standardAnalyzer.tokenStream("", queryString);
            try {
                CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

                // Consumer workflow required by TokenStream: reset, consume, end, close.
                tokenStream.reset();
                while (tokenStream.incrementToken()) {
                    String term = charTermAttribute.toString();
                    orList.add(FilterCreator.createPropertyIsLike(term, propertyName));
                }
                tokenStream.end();
            } finally {
                // Close even when reset()/incrementToken() throws, so the stream is not leaked.
                tokenStream.close();
            }
        } catch (IOException e) {
            // Tokenizing failed: fall back to a single "is like" clause on the whole query.
            orList.add(FilterCreator.createPropertyIsLike(queryString, propertyName));
        } finally {
            standardAnalyzer.close();
        }
    }

    return new Or(new BinaryLogicOpType(orList));
}

From source file:nl.cwi.helpers.NGramExtractor.java

License:Open Source License

/**
 * Extracts n-grams from a string of text and counts how often each one occurs.
 *
 * <p>The arguments are stored in the corresponding instance fields, and the {@code nGrams},
 * {@code uniqueNGrams} and {@code nGramFreqs} fields are re-created on every call. After the
 * call, {@code nGrams} holds every produced n-gram in stream order, {@code uniqueNGrams} holds
 * each distinct n-gram once in order of first appearance, and {@code nGramFreqs} maps each
 * distinct n-gram to its number of occurrences.
 *
 * @param text the text that the n-grams should be extracted from
 * @param minLength the minimum length of the n-grams
 * @param maxLength the maximum length of the n-grams
 * @param stopWords whether or not stop words should be removed before extraction
 * @throws FileNotFoundException declared by the signature; nothing visible in this method
 *         throws it directly
 * @throws IOException if reading from the token stream fails
 */
public void extract(String text, int minLength, int maxLength, Boolean stopWords)
        throws FileNotFoundException, IOException {

    this.text = text;
    this.minLength = minLength;
    this.maxLength = maxLength;
    this.stopWords = stopWords;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /* If minLength and maxLength are both 1, we want unigrams:
     * use a StopAnalyzer when stop words should be removed,
     * and a SimpleAnalyzer when stop words should be included.
     */
    if ((minLength == 1) && (maxLength == 1)) {
        if (this.stopWords) {
            analyzer = new StopAnalyzer(Version.LUCENE_43);
        } else {
            analyzer = new SimpleAnalyzer(Version.LUCENE_43);
        }
    } else {
        // Longer than unigrams, so wrap the base analyzer in a ShingleAnalyzerWrapper.
        // Original author's note: with stop word removal, later Lucene versions put
        // underscores (_) in the place of removed stop words, and an older compatibility
        // version was meant to avoid that.
        // NOTE(review): that note mentions Lucene 2.4, but the code passes Version.LUCENE_42
        // here (and LUCENE_43 for unigrams) - confirm which version is intended.
        if (this.stopWords) {
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_42), minLength, maxLength,
                    " ", false, false);
        } else {
            analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(Version.LUCENE_42), minLength, maxLength,
                    " ", false, false);
        }
    }

    // Run the text through the analyzer and collect every emitted term as an n-gram.
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    try {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        // Consumer workflow required by TokenStream: reset, consume, end, close.
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            nGrams.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } finally {
        // Always release the stream; the analyzer may otherwise refuse to hand out a new one.
        tokenStream.close();
    }

    // Store unique n-grams and their frequencies.
    for (String nGram : nGrams) {
        Integer seen = nGramFreqs.get(nGram);
        if (seen == null) {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        } else {
            nGramFreqs.put(nGram, seen + 1);
        }
    }

}

From source file:nl.inl.blacklab.analysis.BLLatinAnalyzer.java

License:Apache License

/**
 * Small manual demo: analyzes a fixed test string with a {@code BLLatinAnalyzer}, once for
 * the field {@code "test"} and once for the field name produced by
 * {@code ComplexFieldUtil.propertyField("test", null, "s")}, and prints every token on its
 * own line.
 *
 * @param args ignored
 * @throws IOException if reading from a token stream fails
 */
public static void main(String[] args) throws IOException {
    // NOTE(review): the first word looks like it lost an accented character
    // (presumably "H\u00e9") somewhere along the way - confirm against the original file.
    String TEST_STR = "H jij daar!";

    Analyzer a = new BLLatinAnalyzer();
    try {
        printTokens(a.tokenStream("test", new StringReader(TEST_STR)));
        // The first stream is fully closed before the analyzer is asked for another one.
        printTokens(a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"),
                new StringReader(TEST_STR)));
    } finally {
        a.close();
    }
}

/** Consumes the given stream (reset, iterate, end), prints each term, and always closes it. */
private static void printTokens(TokenStream ts) throws IOException {
    try {
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
        ts.end();
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.analysis.BLStandardAnalyzer.java

License:Apache License

/**
 * Small manual demo: analyzes a fixed test string with a {@code BLStandardAnalyzer}, once for
 * the field {@code "test"} and once for the field name produced by
 * {@code ComplexFieldUtil.propertyField("test", null, "s")}, and prints every token on its
 * own line.
 *
 * @param args ignored
 * @throws IOException if reading from a token stream fails
 */
public static void main(String[] args) throws IOException {
    // NOTE(review): this literal appears to have lost non-ASCII characters; it is kept
    // exactly as found because the intended text cannot be reconstructed from here.
    String TEST_STR = "H jij !  ?  ?. ]'      ??. ";

    Analyzer a = new BLStandardAnalyzer();
    try {
        printTokens(a.tokenStream("test", new StringReader(TEST_STR)));
        // The first stream is fully closed before the analyzer is asked for another one.
        printTokens(a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"),
                new StringReader(TEST_STR)));
    } finally {
        a.close();
    }
}

/** Consumes the given stream (reset, iterate, end), prints each term, and always closes it. */
private static void printTokens(TokenStream ts) throws IOException {
    try {
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
        ts.end();
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.analysis.TestBLDutchAnalyzer.java

License:Apache License

/**
 * Checks the tokens {@code BLDutchAnalyzer} emits for a sample string: each expected term
 * must appear in order, and the stream must be exhausted afterwards.
 */
@Test
public void testBasics() throws IOException {
    // "b\u00e8l(len)": the accented letter is written as a Unicode escape so the input does
    // not depend on the source file encoding. It is required for the expected "bellen" token.
    Reader r = new StringReader(
            "1781 \"hond, a.u.b.: b\u00e8l(len); \t [pre]cursor \t\nzo'n 'Hij zij' ex-man -");
    String[] expectedTerms = { "1781", "hond", "aub", "bellen", "precursor", "zo'n", "hij", "zij",
            "ex-man" };

    BLDutchAnalyzer analyzer = new BLDutchAnalyzer();
    try {
        TokenStream ts = analyzer.tokenStream("contents", r);
        try {
            CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
            // TokenStream consumers must call reset() before the first incrementToken().
            ts.reset();
            for (String expected : expectedTerms) {
                Assert.assertTrue(ts.incrementToken());
                Assert.assertEquals(expected, new String(ta.buffer(), 0, ta.length()));
            }
            Assert.assertFalse(ts.incrementToken());
            ts.end();
        } finally {
            ts.close();
        }
    } finally {
        analyzer.close();
    }
}

From source file:nl.inl.blacklab.analysis.TestBLDutchTokenFilter.java

License:Apache License

/**
 * Feeds a fixed list of raw tokens through {@code BLDutchTokenFilter} and verifies the
 * filtered terms one by one, then verifies that no further token is produced.
 */
@Test
public void testBasics() throws IOException {
    String[] rawTokens = new String[] { "hond", "a.u.b.", "bel(len)", "[pre]cursor", "zo'n",
            "'Hij", "zij'", "ex-man", "-" };
    // Terms expected to come out of the filter, in stream order.
    String[] filteredTerms = { "hond", "aub", "bellen", "precursor", "zo'n", "Hij", "zij", "ex-man" };

    TokenStream stream = new StubTokenStream(rawTokens);
    try {
        stream = new BLDutchTokenFilter(stream);
        stream.reset();
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        for (int i = 0; i < filteredTerms.length; i++) {
            Assert.assertTrue(stream.incrementToken());
            String actual = new String(termAttr.buffer(), 0, termAttr.length());
            Assert.assertEquals(filteredTerms[i], actual);
        }
        Assert.assertFalse(stream.incrementToken());
    } finally {
        stream.close();
    }
}

From source file:nl.inl.blacklab.analysis.TestBLDutchTokenizer.java

License:Apache License

/**
 * Tokenizes a sample string with {@code BLDutchTokenizer} and verifies every emitted term in
 * order, then verifies that the tokenizer is exhausted.
 */
@Test
public void testBasics() throws IOException {
    Reader input = new StringReader("\"hond, a.u.b.: bl(len); \t [pre]cursor \t\nzo'n 'Hij zij' ex-man -");
    // Terms the tokenizer is expected to emit, in stream order.
    String[] wanted = { "hond", "a.u.b.", "bl(len)", "[pre]cursor", "zo'n", "'Hij", "zij'", "ex-man",
            "-" };

    TokenStream tokenizer = new BLDutchTokenizer(input);
    tokenizer.reset();
    try {
        CharTermAttribute termAttr = tokenizer.addAttribute(CharTermAttribute.class);
        for (int i = 0; i < wanted.length; i++) {
            Assert.assertTrue(tokenizer.incrementToken());
            String actual = new String(termAttr.buffer(), 0, termAttr.length());
            Assert.assertEquals(wanted[i], actual);
        }
        Assert.assertFalse(tokenizer.incrementToken());
    } finally {
        tokenizer.close();
    }
}

From source file:nl.inl.blacklab.filter.AbstractSynonymFilter.java

License:Apache License

/**
 * @param args/*  ww  w  .j  ava2 s.c  o  m*/
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42, new StringReader("Dit is een test"));
    try {
        ts = new AbstractSynonymFilter(ts) {
            @Override
            public String[] getSynonyms(String s) {
                if (s.equals("test"))
                    return new String[] { "testje" };
                if (s.equals("is"))
                    return new String[] { "zijn" };
                return null;
            }
        };

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TestRemoveAllAccentsFilter.java

License:Apache License

/**
 * Verifies that {@code RemoveAllAccentsFilter} strips the accent from the first token and
 * passes the unaccented second token through unchanged.
 */
@Test
public void testRetrieve() throws IOException {
    // "H\u00e9": the accented letter is written as a Unicode escape so the input does not
    // depend on the source file encoding. Without it the expected "He" cannot be produced.
    TokenStream ts = new StubTokenStream(new String[] { "H\u00e9", "jij" });
    try {
        ts = new RemoveAllAccentsFilter(ts);
        ts.reset();
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("He", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("jij", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TestTranscribeGermanAccentsFilter.java

License:Apache License

/**
 * Verifies that {@code TranscribeGermanAccentsFilter} transcribes the umlaut in the first
 * token ("Koeln") and passes the second token through unchanged.
 */
@Test
public void testRetrieve() throws IOException {
    // "K\u00f6ln": the umlaut is written as a Unicode escape so the input does not depend on
    // the source file encoding. Without it the expected "Koeln" cannot be produced.
    TokenStream ts = new StubTokenStream(new String[] { "K\u00f6ln", "Berlin" });
    try {
        ts = new TranscribeGermanAccentsFilter(ts);
        // TokenStream consumers must call reset() before the first incrementToken();
        // the sibling filter tests do the same.
        ts.reset();
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}