Usage examples for org.apache.lucene.analysis.TokenStream#addAttribute, collected from open-source projects.
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:nl.b3p.viewer.stripes.CatalogSearchActionBean.java
License:Open Source License
/**
 * Builds an OGC Or filter for the given query string: one exact-match
 * (PropertyIsEqualTo) clause on the full query, plus one PropertyIsLike
 * clause per term produced by a StandardAnalyzer configured with the Dutch
 * stop-word set. If tokenization fails, it degrades to a single
 * PropertyIsLike on the whole query string.
 *
 * @param queryString  the raw user query (wildcarded via createQueryString)
 * @param propertyName the catalog property to search on
 * @return an Or filter combining the equality and per-term LIKE clauses
 */
private static Or createOrFilter(String queryString, String propertyName) {
    // Raw List kept deliberately: BinaryLogicOpType's constructor is
    // project-defined and its element type is not visible from here.
    List orList = new ArrayList();
    queryString = createQueryString(queryString, false);
    if (queryString != null && !queryString.trim().equals(defaultWildCard)) {
        propertyName = createPropertyName(propertyName);
        PropertyIsEqualTo propertyIsEqualTo = FilterCreator.createPropertyIsEqualTo(queryString, propertyName);
        orList.add(propertyIsEqualTo);
        // Analyzer and TokenStream both hold resources; the original leaked
        // the analyzer entirely and skipped tokenStream.close() on error.
        StandardAnalyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_45,
                DutchAnalyzer.getDefaultStopSet());
        try {
            TokenStream tokenStream = standardAnalyzer.tokenStream("", queryString);
            try {
                CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
                tokenStream.reset();
                while (tokenStream.incrementToken()) {
                    String term = charTermAttribute.toString();
                    orList.add(FilterCreator.createPropertyIsLike(term, propertyName));
                }
                tokenStream.end();
            } finally {
                tokenStream.close();
            }
        } catch (IOException e) {
            // Tokenization failed: fall back to a single LIKE on the raw query.
            orList.add(FilterCreator.createPropertyIsLike(queryString, propertyName));
        } finally {
            standardAnalyzer.close();
        }
    }
    return new Or(new BinaryLogicOpType(orList));
}
From source file:nl.cwi.helpers.NGramExtractor.java
License:Open Source License
/** * Extracts NGrams from a String of text. * Can handle ngrams of any length and also perform stop word removal before extraction * @param text the text that the ngrams should be extracted from * @param minLength the minimum length of the ngrams * @param maxLength the maximum length of the ngrams * @param stopWords whether or not stopwords should be removed before extraction *//* www . j a v a 2s .co m*/ public void extract(String text, int minLength, int maxLength, Boolean stopWords) throws FileNotFoundException, IOException { this.text = text; this.minLength = minLength; this.maxLength = maxLength; this.stopWords = stopWords; nGrams = new LinkedList<String>(); uniqueNGrams = new LinkedList<String>(); nGramFreqs = new HashMap<String, Integer>(); /* If the minLength and maxLength are both 1, then we want unigrams * Make use of a StopAnalyzer when stopwords should be removed * Make use of a SimpleAnalyzer when stop words should be included */ if ((minLength == 1) && (maxLength == 1)) { if (this.stopWords) { analyzer = new StopAnalyzer(Version.LUCENE_43); } else { analyzer = new SimpleAnalyzer(Version.LUCENE_43); } } else { //Bigger than unigrams so use ShingleAnalyzerWrapper. Once again, different analyzers depending on stop word removal if (this.stopWords) { analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_42), minLength, maxLength, " ", false, false); //This is a hack to use Lucene 2.4 since in 2.4 position increments weren't preserved by default. Using a later version puts underscores (_) in the place of removed stop words. 
} else { analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(Version.LUCENE_42), minLength, maxLength, " ", false, false); } } //Code to process and extract the ngrams TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text)); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); int tokenCount = 0; tokenStream.reset(); //System.out.println("So this is:" + charTermAttribute.toString() ); while (tokenStream.incrementToken()) { //System.out.println("Lets see"); int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String termToken = charTermAttribute.toString(); //The actual token term nGrams.add(termToken); //Add all ngrams to the ngram LinkedList } //Store unique nGrams and frequencies in hash tables for (String nGram : nGrams) { if (nGramFreqs.containsKey(nGram)) { nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1); } else { nGramFreqs.put(nGram, 1); uniqueNGrams.add(nGram); } } }
From source file:nl.inl.blacklab.analysis.BLLatinAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException { String TEST_STR = "H jij daar!"; Analyzer a = new BLLatinAnalyzer(); try {//from ww w . j a va2 s .c o m TokenStream ts = a.tokenStream("test", new StringReader(TEST_STR)); CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); while (ts.incrementToken()) { System.out.println(new String(ta.buffer(), 0, ta.length())); } TokenStream ts2 = a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"), new StringReader(TEST_STR)); ta = ts2.addAttribute(CharTermAttribute.class); while (ts2.incrementToken()) { System.out.println(new String(ta.buffer(), 0, ta.length())); } } finally { a.close(); } }
From source file:nl.inl.blacklab.analysis.BLStandardAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException { String TEST_STR = "H jij ! ? ?. ]' ??. "; Analyzer a = new BLStandardAnalyzer(); try {//from w w w . ja v a 2 s .co m TokenStream ts = a.tokenStream("test", new StringReader(TEST_STR)); CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); while (ts.incrementToken()) { System.out.println(new String(ta.buffer(), 0, ta.length())); } TokenStream ts2 = a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"), new StringReader(TEST_STR)); ta = ts2.addAttribute(CharTermAttribute.class); while (ts2.incrementToken()) { System.out.println(new String(ta.buffer(), 0, ta.length())); } } finally { a.close(); } }
From source file:nl.inl.blacklab.analysis.TestBLDutchAnalyzer.java
License:Apache License
@Test public void testBasics() throws IOException { Reader r = new StringReader("1781 \"hond, a.u.b.: bl(len); \t [pre]cursor \t\nzo'n 'Hij zij' ex-man -"); BLDutchAnalyzer analyzer = new BLDutchAnalyzer(); try {// w w w. j a v a 2 s . c o m TokenStream ts = analyzer.tokenStream("contents", r); try { CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("1781", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("hond", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("aub", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("bellen", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("precursor", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("zo'n", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("hij", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("zij", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("ex-man", new String(ta.buffer(), 0, ta.length())); Assert.assertFalse(ts.incrementToken()); } finally { ts.close(); } } finally { analyzer.close(); } }
From source file:nl.inl.blacklab.analysis.TestBLDutchTokenFilter.java
License:Apache License
/**
 * Checks that BLDutchTokenFilter normalizes a representative set of raw
 * tokens: abbreviations lose their dots ("a.u.b." -> "aub"), bracketed
 * fragments are expanded ("bel(len)" -> "bellen"), stray quotes are
 * stripped, hyphenated words pass through, and a lone "-" is removed.
 */
@Test
public void testBasics() throws IOException {
    TokenStream ts = new StubTokenStream(new String[] { "hond", "a.u.b.", "bel(len)", "[pre]cursor", "zo'n",
            "'Hij", "zij'", "ex-man", "-" });
    try {
        ts = new BLDutchTokenFilter(ts);
        ts.reset();
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        // Data-driven form of the original assert chain: same tokens, same order.
        String[] expectedTokens = { "hond", "aub", "bellen", "precursor", "zo'n", "Hij", "zij", "ex-man" };
        for (String expected : expectedTokens) {
            Assert.assertTrue(ts.incrementToken());
            Assert.assertEquals(expected, new String(ta.buffer(), 0, ta.length()));
        }
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}
From source file:nl.inl.blacklab.analysis.TestBLDutchTokenizer.java
License:Apache License
@Test public void testBasics() throws IOException { Reader r = new StringReader("\"hond, a.u.b.: bl(len); \t [pre]cursor \t\nzo'n 'Hij zij' ex-man -"); TokenStream ts = new BLDutchTokenizer(r); ts.reset();// ww w .ja v a2 s.co m try { CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("hond", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("a.u.b.", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("bl(len)", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("[pre]cursor", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("zo'n", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("'Hij", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("zij'", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("ex-man", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("-", new String(ta.buffer(), 0, ta.length())); Assert.assertFalse(ts.incrementToken()); } finally { ts.close(); } }
From source file:nl.inl.blacklab.filter.AbstractSynonymFilter.java
License:Apache License
/** * @param args/* ww w .j ava2 s.c o m*/ * @throws IOException */ public static void main(String[] args) throws IOException { TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42, new StringReader("Dit is een test")); try { ts = new AbstractSynonymFilter(ts) { @Override public String[] getSynonyms(String s) { if (s.equals("test")) return new String[] { "testje" }; if (s.equals("is")) return new String[] { "zijn" }; return null; } }; CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); while (ts.incrementToken()) { System.out.println(new String(term.buffer(), 0, term.length())); } } finally { ts.close(); } }
From source file:nl.inl.blacklab.filter.TestRemoveAllAccentsFilter.java
License:Apache License
/**
 * Checks that RemoveAllAccentsFilter strips diacritics ("Hé" -> "He") and
 * passes accent-free tokens through unchanged.
 */
@Test
public void testRetrieve() throws IOException {
    // NOTE(review): restored "Hé" — the literal had been mojibake-damaged to
    // "H", which could never yield the expected accent-stripped "He" below.
    TokenStream ts = new StubTokenStream(new String[] { "Hé", "jij" });
    try {
        ts = new RemoveAllAccentsFilter(ts);
        ts.reset();
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("He", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("jij", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}
From source file:nl.inl.blacklab.filter.TestTranscribeGermanAccentsFilter.java
License:Apache License
@Test public void testRetrieve() throws IOException { TokenStream ts = new StubTokenStream(new String[] { "Kln", "Berlin" }); try {//from w w w .j av a 2 s .c o m ts = new TranscribeGermanAccentsFilter(ts); CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length())); Assert.assertFalse(ts.incrementToken()); } finally { ts.close(); } }