List of usage examples for org.apache.lucene.analysis.TokenStream#addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
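All of the examples below follow the same consumption pattern, so here is a minimal sketch of it up front. The method name, field name, and text are placeholders, not taken from any example in this list:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Minimal sketch of the standard consumption pattern. addAttribute() registers
// the attribute with the stream (or returns the instance already registered);
// the same object is then updated in place by every incrementToken() call, so
// it is fetched once, before the loop.
static void printTokens(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("field", new StringReader(text));
    try {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                      // required before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();                        // records end-of-stream state (final offset)
    } finally {
        ts.close();
    }
}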
From source file: nl.inl.blacklab.filter.TestTranscribeGermanAccentsSynonymFilter.java
License: Apache License
@Test
public void testRetrieve() throws IOException {
    TokenStream ts = new StubTokenStream(new String[] { "Köln", "Berlin" });
    try {
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        // the synonym filter first emits the original term, then its transcription
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Köln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}
From source file: nl.inl.blacklab.filter.TranscribeGermanAccentsFilter.java
License: Apache License
/**
 * Test program.
 *
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen Düsseldorf Köln Berlin Österreich"));
    try {
        ts = new TranscribeGermanAccentsFilter(ts);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
        ts.end();
    } finally {
        ts.close();
    }
}
From source file: nl.inl.blacklab.filter.TranscribeGermanAccentsSynonymFilter.java
License: Apache License
public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen Düsseldorf Köln Berlin Österreich"));
    try {
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        ts = new RemoveAllAccentsFilter(ts);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // reset once, after the complete filter chain has been assembled
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
        ts.end();
    } finally {
        ts.close();
    }
}
From source file: nl.inl.blacklab.index.BLDefaultAnalyzer.java
License: Apache License
public static void main(String[] args) throws IOException {
    String TEST_STR = "H jij ! ? ?. ]' ??. ";
    Analyzer a = new BLDefaultAnalyzer();
    try {
        TokenStream ts = a.tokenStream("test", new StringReader(TEST_STR));
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
        // close this stream before requesting another one from the same analyzer
        ts.close();
        TokenStream ts2 = a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"),
                new StringReader(TEST_STR));
        ta = ts2.addAttribute(CharTermAttribute.class);
        ts2.reset();
        while (ts2.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
        ts2.close();
    } finally {
        a.close();
    }
}
From source file: nl.inl.blacklab.index.complex.TokenStreamFromList.java
License: Apache License
public static void main(String[] args) throws IOException {
    TokenStream s = new TokenStreamFromList(Arrays.asList("a", "b", "c"), Arrays.asList(1, 1, 1));
    try {
        CharTermAttribute term = s.addAttribute(CharTermAttribute.class);
        s.reset();
        s.incrementToken();
        System.out.println(new String(term.buffer(), 0, term.length()));
        s.incrementToken();
        System.out.println(new String(term.buffer(), 0, term.length()));
        s.incrementToken();
        System.out.println(new String(term.buffer(), 0, term.length()));
        // all three terms are consumed, so this prints "false"
        System.out.println(s.incrementToken());
    } finally {
        s.close();
    }
}
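The examples so far all consume tokens, but addAttribute() matters just as much on the producer side. Below is a minimal sketch of how a list-backed stream like TokenStreamFromList might be implemented; this is an illustration only, not BlackLab's actual code, and the class name is made up:

import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

// A TokenStream subclass calls addAttribute() once, at construction time, to
// register the attributes it will fill in on each incrementToken() call.
class ListTokenStream extends TokenStream {
    private final List<String> terms;
    private final List<Integer> increments;
    private int index = -1;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    ListTokenStream(List<String> terms, List<Integer> increments) {
        this.terms = terms;
        this.increments = increments;
    }

    @Override
    public boolean incrementToken() {
        if (index + 1 >= terms.size()) {
            return false;                // stream exhausted
        }
        index++;
        clearAttributes();               // reset all attributes to their defaults
        termAtt.append(terms.get(index));
        posIncAtt.setPositionIncrement(increments.get(index));
        return true;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        index = -1;                      // allow the stream to be consumed again
    }
}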
From source file: nl.uva.sne.commons.SemanticUtils.java
public static List<String> tokenize(String text, boolean stem) throws IOException, JWNLException {
    text = text.replaceAll("’", "'");   // normalize typographic apostrophes
    text = text.replaceAll("_", " ");
    text = text.replaceAll("[0-9]", "");
    text = text.replaceAll("[\\p{Punct}&&[^'-]]+", " ");
    text = text.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", ""); // drop contractions: 't 'd 's 'm 've 're 'll
    text = text.toLowerCase();
    TokenStream tokenStream;
    if (stem) {
        tokenStream = tokenStemStream("field", new StringReader(text));
    } else {
        tokenStream = tokenStream("field", new StringReader(text));
    }
    ArrayList<String> words = new ArrayList<>();
    try {
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(term.toString());
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
    }
    // Logger.getLogger(SemanticUtils.class.getName()).log(Level.INFO, "Returning {0}:", words.size() + " tokens");
    return words;
}
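A hypothetical call for illustration; the actual tokens depend on the analyzers behind the tokenStream/tokenStemStream helpers, which this listing does not show:

// Hypothetical usage of the method above; the input text is made up, and the
// result shown assumes an English stemming analyzer.
List<String> words = SemanticUtils.tokenize("The user's documents weren't indexed.", true);
System.out.println(words);  // e.g. [the, user, document, weren, index]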
From source file: org.alfresco.repo.search.impl.lucene.analysis.MLAnalayserTest.java
License: Open Source License
/**
 * Check that the TokenStream yields exactly the specified tokens.
 * Order is not checked, since the map of locales does not provide a
 * predictable ordering when enumerated.
 *
 * The expected list may contain the same token more than once, and the
 * number of instances must match the number found in the stream.
 *
 * @param ts TokenStream to inspect.
 * @param expectedTokens tokens expected from the stream, in any order.
 * @throws IOException
 */
private void verifyTokenStream(TokenStream ts, List<String> expectedTokens) throws IOException {
    final int expectedCount = expectedTokens.size();
    int count = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            System.out.println("Token: " + termAtt.toString());
            if (expectedTokens.contains(termAtt.toString())) {
                // remove one instance of the term text so that it is not matched again
                expectedTokens.remove(termAtt.toString());
            } else {
                fail("Unexpected token: " + termAtt.toString());
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    assertEquals("Incorrect number of tokens generated.", expectedCount, count);
}
From source file: org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilterTest.java
License: Open Source License
private void tokenise(TokenStream ts, String[] tokens) throws IOException {
    int i = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + ts.reflectAsString(true));
            String termText = termAtt.toString();
            if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME)) {
                assert (i % 2 == 1);
                assertEquals(termText, tokens[i++]);
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    if (i != tokens.length) {
        fail("Invalid number of tokens, found " + i + " and expected " + tokens.length);
    }
}
From source file: org.apache.jackrabbit.core.query.lucene.AbstractIndex.java
License: Apache License
/**
 * Returns a document that is finished with text extraction and is ready to
 * be added to the index.
 *
 * @param doc the document to check.
 * @return <code>doc</code> if it is finished already, or a stripped-down
 *         copy of <code>doc</code> without text extractors.
 * @throws IOException if the document cannot be added to the indexing
 *                     queue.
 */
private Document getFinishedDocument(Document doc) throws IOException {
    if (!Util.isDocumentReady(doc)) {
        Document copy = new Document();
        // mark the document that reindexing is required
        copy.add(new Field(FieldNames.REINDEXING_REQUIRED, false, "", Field.Store.NO,
                Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
        for (Fieldable f : doc.getFields()) {
            Fieldable field = null;
            Field.TermVector tv = getTermVectorParameter(f);
            Field.Store stored = f.isStored() ? Field.Store.YES : Field.Store.NO;
            Field.Index indexed = getIndexParameter(f);
            if (f instanceof LazyTextExtractorField || f.readerValue() != null) {
                // replace all readers with an empty string reader
                field = new Field(f.name(), new StringReader(""), tv);
            } else if (f.stringValue() != null) {
                field = new Field(f.name(), false, f.stringValue(), stored, indexed, tv);
            } else if (f.isBinary()) {
                field = new Field(f.name(), f.getBinaryValue(), stored);
            } else if (f.tokenStreamValue() != null && f.tokenStreamValue() instanceof SingletonTokenStream) {
                // extract the single token and its payload, then rebuild the stream
                TokenStream tokenStream = f.tokenStreamValue();
                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
                tokenStream.incrementToken();
                String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
                tokenStream.reset();
                field = new Field(f.name(),
                        new SingletonTokenStream(value, (Payload) payloadAttribute.getPayload().clone()));
            }
            if (field != null) {
                field.setOmitNorms(f.getOmitNorms());
                copy.add(field);
            }
        }
        // schedule the original document for later indexing
        Document existing = indexingQueue.addDocument(doc);
        if (existing != null) {
            // the queue already contained a pending document for this
            // node -> dispose of that document
            Util.disposeDocument(existing);
        }
        // use the stripped-down copy for now
        doc = copy;
    }
    return doc;
}
From source file: org.apache.jackrabbit.core.query.lucene.JackrabbitQueryParser.java
License: Apache License
/**
 * {@inheritDoc}
 */
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
    // only create a prefix query when the term is a single word / token
    Analyzer a = getAnalyzer();
    TokenStream ts = a.tokenStream(field, new StringReader(termStr));
    int count = 0;
    boolean isCJ = false;
    try {
        TypeAttribute t = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            isCJ = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.CJ].equals(t.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new ParseException(e.getMessage());
    } finally {
        try {
            ts.close();
        } catch (IOException e) {
            // ignore
        }
    }
    if (count > 1 && isCJ) {
        // a multi-token CJ term cannot be prefix-expanded; fall back to a field query
        return getFieldQuery(field, termStr);
    } else {
        return getWildcardQuery(field, termStr + "*");
    }
}
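For illustration, the outcomes one would expect under a StandardTokenizer-style analyzer; these examples are hypothetical, not taken from the Jackrabbit sources:

// Hypothetical behavior of getPrefixQuery(), assuming StandardTokenizer-style analysis:
// getPrefixQuery("title", "lucene") -> wildcard query title:lucene*   (single token)
// getPrefixQuery("title", "日本語")   -> getFieldQuery(...), because the term analyzes
//                                       into multiple CJ tokens, so appending "*" to
//                                       the raw string would match nothing useful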