List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; value. The method returns the instance of that attribute contained in the stream and throws IllegalArgumentException if the attribute has not been added; consumers that cannot be sure an attribute exists should use addAttribute(Class) instead, which creates the instance on demand.
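Before the per-project examples, a minimal sketch of that contract (assuming the Lucene 5+ API with the no-arg StandardAnalyzer; the field name "body" and the sample text are illustrative, not taken from the examples below):

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeSketch {
    public static void main(String[] args) throws IOException {
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
                TokenStream ts = analyzer.tokenStream("body", "Hello token stream")) {
            // getAttribute returns the attribute instance the tokenizer registered,
            // or throws IllegalArgumentException if it was never added;
            // addAttribute would create the instance on demand instead.
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            ts.reset();                 // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                   // records end-of-stream state
        }
    }
}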
From source file:net.sf.logsaw.index.internal.LuceneIndexServiceImpl.java
License:Open Source License
private void fillPhraseQuery(PhraseQuery phrase, Analyzer analyzer, String fld, String val) throws IOException {
    TokenStream ts = analyzer.tokenStream(fld, new StringReader(val));
    try {
        ts.reset();
        // Iterate over tokens and treat each token as a term
        int pos = 0;
        while (ts.incrementToken()) {
            CharTermAttribute t = ts.getAttribute(CharTermAttribute.class);
            PositionIncrementAttribute p = ts.getAttribute(PositionIncrementAttribute.class);
            pos += p.getPositionIncrement();
            phrase.add(new Term(fld, t.toString()), pos - 1);
        }
        // End-of-stream clean-up
        ts.end();
    } finally {
        ts.close();
    }
}
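The per-token getAttribute calls above work, but they are redundant: attribute instances are fixed for the life of the stream, so they can be fetched once before the loop, and since TokenStream implements Closeable the try/finally collapses into try-with-resources on Java 7+. A sketch of the same method under those assumptions (keeping the mutable pre-5.0 PhraseQuery the original uses):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.PhraseQuery;

private void fillPhraseQuery(PhraseQuery phrase, Analyzer analyzer, String fld, String val) throws IOException {
    try (TokenStream ts = analyzer.tokenStream(fld, new StringReader(val))) {
        // Attribute instances never change between tokens; fetch them once.
        CharTermAttribute t = ts.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute p = ts.getAttribute(PositionIncrementAttribute.class);
        ts.reset();
        int pos = 0;
        while (ts.incrementToken()) {
            pos += p.getPositionIncrement();
            phrase.add(new Term(fld, t.toString()), pos - 1);
        }
        ts.end();
    }
}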
From source file:net.sf.mmm.search.engine.impl.lucene.LuceneFieldManagerImpl.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public Term createTerm(String field, Object value) {
    NlsNullPointerException.checkNotNull("field", field);
    NlsNullPointerException.checkNotNull("value", value);
    String normalizedValue;
    SearchFieldConfiguration fieldConfiguration = getSearchFields().getOrCreateFieldConfiguration(field);
    SearchFieldType fieldType = fieldConfiguration.getType();
    boolean isString = (value instanceof String);
    try {
        switch (fieldType) {
        case TEXT:
            try {
                TokenStream tokenStream = this.analyzer.tokenStream(field, new StringReader((String) value));
                TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
                if (tokenStream.incrementToken()) {
                    normalizedValue = termAttribute.term();
                } else {
                    normalizedValue = "";
                }
            } catch (IOException e) {
                throw new RuntimeIoException(e, IoMode.READ);
            }
            break;
        case STRING:
            normalizedValue = (String) value;
            break;
        case INTEGER:
            int i;
            if (isString) {
                i = Integer.parseInt((String) value);
            } else {
                i = ((Integer) value).intValue();
            }
            normalizedValue = NumericUtils.intToPrefixCoded(i);
            break;
        case LONG:
            long l;
            if (isString) {
                l = Long.parseLong((String) value);
            } else {
                l = ((Long) value).longValue();
            }
            normalizedValue = NumericUtils.longToPrefixCoded(l);
            break;
        case FLOAT:
            float f;
            if (isString) {
                f = Float.parseFloat((String) value);
            } else {
                f = ((Float) value).floatValue();
            }
            normalizedValue = NumericUtils.floatToPrefixCoded(f);
            break;
        case DOUBLE:
            double d;
            if (isString) {
                d = Double.parseDouble((String) value);
            } else {
                d = ((Double) value).doubleValue();
            }
            normalizedValue = NumericUtils.doubleToPrefixCoded(d);
            break;
        case DATE:
            Date date;
            if (isString) {
                date = this.iso8601Util.parseDate((String) value);
            } else {
                date = (Date) value;
            }
            normalizedValue = NumericUtils.longToPrefixCoded(date.getTime());
            break;
        default:
            throw new IllegalCaseException(SearchFieldType.class, fieldType);
        }
    } catch (ClassCastException e) {
        throw new NlsClassCastException(e, value, fieldType.getFieldClass());
    }
    return new Term(field, normalizedValue);
}
From source file:net.sf.mmm.search.engine.impl.lucene.LuceneFieldManagerImpl.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public Query createPhraseQuery(String field, String value) {
    NlsNullPointerException.checkNotNull("field", field);
    NlsNullPointerException.checkNotNull("value", value);
    SearchFieldConfiguration fieldConfiguration = getSearchFields().getOrCreateFieldConfiguration(field);
    SearchFieldType fieldType = fieldConfiguration.getType();
    Query result;
    if (fieldType == SearchFieldType.TEXT) {
        PhraseQuery phraseQuery = new PhraseQuery();
        result = phraseQuery;
        try {
            TokenStream tokenStream = this.analyzer.tokenStream(field, new StringReader(value));
            TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
            while (tokenStream.incrementToken()) {
                phraseQuery.add(new Term(field, termAttribute.term()));
            }
        } catch (IOException e) {
            throw new RuntimeIoException(e, IoMode.READ);
        }
    } else {
        result = new TermQuery(createTerm(field, value));
    }
    return result;
}
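Both LuceneFieldManagerImpl examples are written against the Lucene 3.x API, where TermAttribute carried the token text, PhraseQuery was mutable, and reset() was not yet enforced before consumption. On Lucene 5+ the same loop would use CharTermAttribute (TermAttribute was removed in 4.0) and the immutable PhraseQuery built through PhraseQuery.Builder. A hedged sketch, not the project's actual code:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;

static Query createPhraseQuery(Analyzer analyzer, String field, String value) throws IOException {
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    try (TokenStream ts = analyzer.tokenStream(field, value)) {
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            builder.add(new Term(field, term.toString()));
        }
        ts.end();
    }
    return builder.build();
}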
From source file:net.sf.zekr.engine.search.lucene.ZekrLuceneAnalyzerTest.java
public void testNextToken1() throws Exception {
    ZekrLuceneAnalyzer zla = new ZekrLuceneAnalyzer(ZekrLuceneAnalyzer.QURAN_LANG_CODE, null);
    TokenStream ts1 = zla.tokenStream(null, new StringReader(ARABIC_STR_ORIG1));
    TokenStream ts2 = new WhitespaceTokenizer(new StringReader(ARABIC_STR1));
    boolean hasMore = ts1.incrementToken();
    ts2.incrementToken();
    // getAttribute is generic, so no cast is needed; the returned instances
    // stay valid across incrementToken() calls and need only be fetched once.
    TermAttribute t1 = ts1.getAttribute(TermAttribute.class);
    TermAttribute t2 = ts2.getAttribute(TermAttribute.class);
    while (hasMore) {
        assertEquals(new String(t1.termBuffer(), 0, t1.termLength()),
                new String(t2.termBuffer(), 0, t2.termLength()));
        hasMore = ts1.incrementToken();
        ts2.incrementToken();
    }
}
From source file:net.sf.zekr.engine.search.lucene.ZekrLuceneAnalyzerTest.java
public void testNextToken2() throws Exception {
    ZekrLuceneAnalyzer zla = new ZekrLuceneAnalyzer(ZekrLuceneAnalyzer.QURAN_LANG_CODE, null);
    TokenStream ts1 = zla.tokenStream(null, new StringReader(ARABIC_STR_ORIG2));
    TokenStream ts2 = new WhitespaceTokenizer(new StringReader(ARABIC_STR2));
    boolean hasMore = ts1.incrementToken();
    ts2.incrementToken();
    TermAttribute t1 = ts1.getAttribute(TermAttribute.class);
    TermAttribute t2 = ts2.getAttribute(TermAttribute.class);
    while (hasMore) {
        assertEquals(new String(t1.termBuffer(), 0, t1.termLength()),
                new String(t2.termBuffer(), 0, t2.termLength()));
        hasMore = ts1.incrementToken();
        ts2.incrementToken();
    }
}
From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java
License:Apache License
@Override
public String[] getQueryTokens(final String queryString) {
    TokenStream tokenStream = null;
    try {
        tokenStream = getDefaultAnalyzer().tokenStream("QUERY_TOKENS", new StringReader(queryString));
        tokenStream.reset();
        final ArrayList<String> al = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            final String term = tokenStream.getAttribute(CharTermAttribute.class).toString();
            if (term != null && term.length() > 1) {
                al.add(term);
            }
        }
        if (al.size() == 0) {
            al.add(queryString);
        }
        return al.toArray(new String[al.size()]);
    } catch (final IOException e) {
        throw ADOException.of(e);
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.close();
            } catch (final IOException e) {
            }
        }
    }
}
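getQueryTokens assumes the analyzer's tokenizer provides a CharTermAttribute; if it does not, getAttribute throws IllegalArgumentException. A defensive consumer can check hasAttribute first, or call addAttribute, which returns the existing instance or creates one on demand. A minimal sketch, with analyzer and queryString assumed to be in scope:

try (TokenStream ts = analyzer.tokenStream("QUERY_TOKENS", new StringReader(queryString))) {
    // hasAttribute avoids the IllegalArgumentException that getAttribute
    // throws for a missing attribute; addAttribute creates it if absent.
    CharTermAttribute term = ts.hasAttribute(CharTermAttribute.class)
            ? ts.getAttribute(CharTermAttribute.class)
            : ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString());
    }
    ts.end();
}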
From source file:net.skyatlas.icd.dao.daoImpl.AnsjAnalysisTest.java
@Test
public void test() throws IOException {
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    // The original Chinese sample text was lost to a character-encoding
    // problem; the runs of '?' below stand in for it.
    Reader sentence = new StringReader("\n\n\n\n\n\n\n????, ????????????????????????????"
            + "???????????????????" + "??????????? ??????????????2????" + ""
            + "? ????????????? ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);
    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");
}
From source file:net.skyatlas.icd.test.AnsegTest.java
static public void main(String[] args)
        throws IOException, CorruptIndexException, ParseException, InvalidTokenOffsetsException {
    AnsegTest inst = new AnsegTest();
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    // The Chinese sample text, index terms, and query strings in this example
    // were lost to a character-encoding problem; the runs of '?' and empty
    // strings below stand in for the originals.
    Reader sentence = new StringReader("\n\n\n\n\n\n\n????, ????????????????????????????"
            + "???????????????????" + "??????????? ??????????????2????" + ""
            + "? ????????????? ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);
    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");

    // Build a stopword set from the configured stop library.
    HashSet<String> hs = new HashSet<String>();
    BufferedReader reader2 = IOUtil.getReader(ResourceBundle.getBundle("library").getString("stopLibrary"), "UTF-8");
    String word = null;
    while ((word = reader2.readLine()) != null) {
        hs.add(word);
    }
    Analyzer analyzer = new AnsjAnalysis(hs, false);
    Directory directory = null;
    IndexWriter iwriter = null;

    BufferedReader reader = IOUtil.getReader("/Users/changzhenghe/Downloads/hy_statspack01.txt", "UTF-8");
    String temp = null;
    StringBuilder sb = new StringBuilder();
    while ((temp = reader.readLine()) != null) {
        sb.append(temp);
        sb.append("\n");
    }
    reader.close();
    String text = sb.toString();
    text = "???????????? ??? ????????";

    IndexWriterConfig ic = new IndexWriterConfig(Version.LUCENE_32, analyzer);
    // Build an in-memory index.
    directory = new RAMDirectory();
    iwriter = new IndexWriter(directory, ic);
    // BufferedReader reader = IOUtil.getReader("/Users/ansj/Documents//?//1998?_.txt", "GBK");
    // String temp = null;
    // while ((temp = reader.readLine()) != null) {
    //     addContent(iwriter, temp);
    // }
    inst.addContent(iwriter, "? ?() (?)");
    inst.addContent(iwriter, " ?() (?)");
    inst.addContent(iwriter, "? ? (?)");
    inst.addContent(iwriter, " ??NEC ");
    inst.addContent(iwriter, "?");
    iwriter.commit();
    iwriter.close();
    System.out.println("");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "?");

    /*
    KeyWordComputer kwc = new KeyWordComputer(5);
    String title = "??";
    String content = "9??" + "?????????" + "????" + "??" + "?????" + "???"
            + "??????" + "???" + "????20??" + "????" + "?" + "???]??" + "???";
    Collection<Keyword> result = kwc.computeArticleTfidf(title, content);
    System.out.println(result);

    AnsegTest t = new AnsegTest();
    List<Term> parse = ToAnalysis.parse("?");
    System.out.println(parse);
    System.out.println("*********** ? ************");
    // UserDefineLibrary.insertWord("", "userDefine", 1000);
    // UserDefineLibrary.insertWord("?", "userDefine", 1000);
    UserDefineLibrary.insertWord("?", "userDefine", 1000);
    parse = ToAnalysis.parse("???");
    System.out.println(parse);
    */
}
From source file:nl.uva.lucenefacility.LuceneUtil.java
public static List<String> tokenizeString(Analyzer analyzer, String string) {
    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        // end() records end-of-stream state; close() releases the stream so the
        // analyzer can be reused (Lucene enforces this from 4.x on).
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
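A possible call site, assuming Lucene's StandardAnalyzer is on the classpath (the exact output depends on the analyzer's lowercasing and stopword configuration):

List<String> tokens = LuceneUtil.tokenizeString(new StandardAnalyzer(), "The Quick Brown Fox");
System.out.println(tokens); // lowercased terms, e.g. [quick, brown, fox] if English stopwords are enabled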
From source file:org.aksw.palmetto.corpus.lucene.SimpleAnalyzerTest.java
License:Open Source License
public void test(boolean lowercase) throws Exception {
    SimpleAnalyzer analyzer = new SimpleAnalyzer(lowercase);
    TokenStream stream = analyzer.tokenStream("test", text);
    CharTermAttribute token;
    int count = 0;
    stream.reset();
    while (stream.incrementToken()) {
        Assert.assertTrue(count < expectedTokens.length);
        token = stream.getAttribute(CharTermAttribute.class);
        if (lowercase) {
            Assert.assertEquals(expectedTokens[count].toLowerCase(), token.toString());
        } else {
            Assert.assertEquals(expectedTokens[count], token.toString());
        }
        ++count;
    }
    Assert.assertEquals(expectedTokens.length, count);
    analyzer.close();
}