Usage examples for org.apache.lucene.analysis.TokenStream#addAttribute, collected from open-source projects.
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:nl.b3p.viewer.stripes.CatalogSearchActionBean.java
License:Open Source License
/**
 * Builds an OGC Or filter for the given query string: one exact-match
 * (PropertyIsEqualTo) clause on the full query, plus one PropertyIsLike
 * clause per term produced by a StandardAnalyzer configured with the Dutch
 * stop-word set. If tokenization fails, it degrades to a single
 * PropertyIsLike on the whole query string.
 *
 * @param queryString  the raw user query (wildcarded via createQueryString)
 * @param propertyName the catalog property to search on
 * @return an Or filter combining the equality and per-term LIKE clauses
 */
private static Or createOrFilter(String queryString, String propertyName) {
    // Raw List kept deliberately: BinaryLogicOpType's constructor is
    // project-defined and its element type is not visible from here.
    List orList = new ArrayList();
    queryString = createQueryString(queryString, false);
    if (queryString != null && !queryString.trim().equals(defaultWildCard)) {
        propertyName = createPropertyName(propertyName);
        PropertyIsEqualTo propertyIsEqualTo = FilterCreator.createPropertyIsEqualTo(queryString, propertyName);
        orList.add(propertyIsEqualTo);
        // Analyzer and TokenStream both hold resources; the original leaked
        // the analyzer entirely and skipped tokenStream.close() on error.
        StandardAnalyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_45,
                DutchAnalyzer.getDefaultStopSet());
        try {
            TokenStream tokenStream = standardAnalyzer.tokenStream("", queryString);
            try {
                CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
                tokenStream.reset();
                while (tokenStream.incrementToken()) {
                    String term = charTermAttribute.toString();
                    orList.add(FilterCreator.createPropertyIsLike(term, propertyName));
                }
                tokenStream.end();
            } finally {
                tokenStream.close();
            }
        } catch (IOException e) {
            // Tokenization failed: fall back to a single LIKE on the raw query.
            orList.add(FilterCreator.createPropertyIsLike(queryString, propertyName));
        } finally {
            standardAnalyzer.close();
        }
    }
    return new Or(new BinaryLogicOpType(orList));
}
From source file:nl.cwi.helpers.NGramExtractor.java
License:Open Source License
/** * Extracts NGrams from a String of text. * Can handle ngrams of any length and also perform stop word removal before extraction * @param text the text that the ngrams should be extracted from * @param minLength the minimum length of the ngrams * @param maxLength the maximum length of the ngrams * @param stopWords whether or not stopwords should be removed before extraction *//* www . j a v a 2s .co m*/ public void extract(String text, int minLength, int maxLength, Boolean stopWords) throws FileNotFoundException, IOException { this.text = text; this.minLength = minLength; this.maxLength = maxLength; this.stopWords = stopWords; nGrams = new LinkedList<String>(); uniqueNGrams = new LinkedList<String>(); nGramFreqs = new HashMap<String, Integer>(); /* If the minLength and maxLength are both 1, then we want unigrams * Make use of a StopAnalyzer when stopwords should be removed * Make use of a SimpleAnalyzer when stop words should be included */ if ((minLength == 1) && (maxLength == 1)) { if (this.stopWords) { analyzer = new StopAnalyzer(Version.LUCENE_43); } else { analyzer = new SimpleAnalyzer(Version.LUCENE_43); } } else { //Bigger than unigrams so use ShingleAnalyzerWrapper. Once again, different analyzers depending on stop word removal if (this.stopWords) { analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_42), minLength, maxLength, " ", false, false); //This is a hack to use Lucene 2.4 since in 2.4 position increments weren't preserved by default. Using a later version puts underscores (_) in the place of removed stop words. 
} else { analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(Version.LUCENE_42), minLength, maxLength, " ", false, false); } } //Code to process and extract the ngrams TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text)); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); int tokenCount = 0; tokenStream.reset(); //System.out.println("So this is:" + charTermAttribute.toString() ); while (tokenStream.incrementToken()) { //System.out.println("Lets see"); int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String termToken = charTermAttribute.toString(); //The actual token term nGrams.add(termToken); //Add all ngrams to the ngram LinkedList } //Store unique nGrams and frequencies in hash tables for (String nGram : nGrams) { if (nGramFreqs.containsKey(nGram)) { nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1); } else { nGramFreqs.put(nGram, 1); uniqueNGrams.add(nGram); } } }
From source file:nl.inl.blacklab.analysis.BLLatinAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException { String TEST_STR = "H jij daar!"; Analyzer a = new BLLatinAnalyzer(); try {//from ww w . j a va2 s .c o m TokenStream ts = a.tokenStream("test", new StringReader(TEST_STR)); CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); while (ts.incrementToken()) { System.out.println(new String(ta.buffer(), 0, ta.length())); } TokenStream ts2 = a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"), new StringReader(TEST_STR)); ta = ts2.addAttribute(CharTermAttribute.class); while (ts2.incrementToken()) { System.out.println(new String(ta.buffer(), 0, ta.length())); } } finally { a.close(); } }
From source file:nl.inl.blacklab.analysis.BLStandardAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException { String TEST_STR = "H jij ! ? ?. ]' ??. "; Analyzer a = new BLStandardAnalyzer(); try {//from w w w . ja v a 2 s .co m TokenStream ts = a.tokenStream("test", new StringReader(TEST_STR)); CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); while (ts.incrementToken()) { System.out.println(new String(ta.buffer(), 0, ta.length())); } TokenStream ts2 = a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"), new StringReader(TEST_STR)); ta = ts2.addAttribute(CharTermAttribute.class); while (ts2.incrementToken()) { System.out.println(new String(ta.buffer(), 0, ta.length())); } } finally { a.close(); } }
From source file:nl.inl.blacklab.analysis.TestBLDutchAnalyzer.java
License:Apache License
@Test public void testBasics() throws IOException { Reader r = new StringReader("1781 \"hond, a.u.b.: bl(len); \t [pre]cursor \t\nzo'n 'Hij zij' ex-man -"); BLDutchAnalyzer analyzer = new BLDutchAnalyzer(); try {// w w w. j a v a 2 s . c o m TokenStream ts = analyzer.tokenStream("contents", r); try { CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("1781", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("hond", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("aub", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("bellen", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("precursor", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("zo'n", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("hij", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("zij", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("ex-man", new String(ta.buffer(), 0, ta.length())); Assert.assertFalse(ts.incrementToken()); } finally { ts.close(); } } finally { analyzer.close(); } }
From source file:nl.inl.blacklab.analysis.TestBLDutchTokenFilter.java
License:Apache License
/**
 * Checks that BLDutchTokenFilter normalizes a representative set of raw
 * tokens: abbreviations lose their dots ("a.u.b." -> "aub"), bracketed
 * fragments are expanded ("bel(len)" -> "bellen"), stray quotes are
 * stripped, hyphenated words pass through, and a lone "-" is removed.
 */
@Test
public void testBasics() throws IOException {
    TokenStream ts = new StubTokenStream(new String[] { "hond", "a.u.b.", "bel(len)", "[pre]cursor", "zo'n",
            "'Hij", "zij'", "ex-man", "-" });
    try {
        ts = new BLDutchTokenFilter(ts);
        ts.reset();
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        // Data-driven form of the original assert chain: same tokens, same order.
        String[] expectedTokens = { "hond", "aub", "bellen", "precursor", "zo'n", "Hij", "zij", "ex-man" };
        for (String expected : expectedTokens) {
            Assert.assertTrue(ts.incrementToken());
            Assert.assertEquals(expected, new String(ta.buffer(), 0, ta.length()));
        }
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}
From source file:nl.inl.blacklab.analysis.TestBLDutchTokenizer.java
License:Apache License
@Test public void testBasics() throws IOException { Reader r = new StringReader("\"hond, a.u.b.: bl(len); \t [pre]cursor \t\nzo'n 'Hij zij' ex-man -"); TokenStream ts = new BLDutchTokenizer(r); ts.reset();// ww w .ja v a2 s.co m try { CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("hond", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("a.u.b.", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("bl(len)", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("[pre]cursor", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("zo'n", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("'Hij", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("zij'", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("ex-man", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("-", new String(ta.buffer(), 0, ta.length())); Assert.assertFalse(ts.incrementToken()); } finally { ts.close(); } }
From source file:nl.inl.blacklab.filter.AbstractSynonymFilter.java
License:Apache License
/** * @param args/* ww w .j ava2 s.c o m*/ * @throws IOException */ public static void main(String[] args) throws IOException { TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42, new StringReader("Dit is een test")); try { ts = new AbstractSynonymFilter(ts) { @Override public String[] getSynonyms(String s) { if (s.equals("test")) return new String[] { "testje" }; if (s.equals("is")) return new String[] { "zijn" }; return null; } }; CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); while (ts.incrementToken()) { System.out.println(new String(term.buffer(), 0, term.length())); } } finally { ts.close(); } }
From source file:nl.inl.blacklab.filter.TestRemoveAllAccentsFilter.java
License:Apache License
/**
 * Checks that RemoveAllAccentsFilter strips diacritics ("Hé" -> "He") and
 * passes accent-free tokens through unchanged.
 */
@Test
public void testRetrieve() throws IOException {
    // NOTE(review): restored "Hé" — the literal had been mojibake-damaged to
    // "H", which could never yield the expected accent-stripped "He" below.
    TokenStream ts = new StubTokenStream(new String[] { "Hé", "jij" });
    try {
        ts = new RemoveAllAccentsFilter(ts);
        ts.reset();
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("He", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("jij", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}
From source file:nl.inl.blacklab.filter.TestTranscribeGermanAccentsFilter.java
License:Apache License
@Test public void testRetrieve() throws IOException { TokenStream ts = new StubTokenStream(new String[] { "Kln", "Berlin" }); try {//from w w w .j av a 2 s .c o m ts = new TranscribeGermanAccentsFilter(ts); CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length())); Assert.assertFalse(ts.incrementToken()); } finally { ts.close(); } }