List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; value naming the attribute to retrieve from the stream.
From source file:com.github.rnewson.couchdb.lucene.util.AnalyzersTest.java
License:Apache License
/**
 * Tokenizes {@code text} with the named analyzer and returns the emitted terms.
 *
 * @param analyzerName name resolved by {@code Analyzers.getAnalyzer}
 * @param text         input text to analyze
 * @return the produced terms, in token-stream order
 * @throws Exception if analysis fails
 */
private String[] analyze(final String analyzerName, final String text) throws Exception {
    final Analyzer analyzer = Analyzers.getAnalyzer(analyzerName);
    final TokenStream stream = analyzer.tokenStream("default", new StringReader(text));
    final List<String> result = new ArrayList<String>();
    try {
        stream.reset();
        // The same attribute instance is reused for every token, so look it up once.
        final CharTermAttribute c = stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            result.add(c.toString());
        }
        stream.end(); // TokenStream contract: end() after the last incrementToken()
    } finally {
        stream.close(); // was missing — the stream leaked on every call
    }
    return result.toArray(new String[0]);
}
From source file:com.github.tteofili.looseen.MinHashClassifier.java
License:Apache License
private ArrayList<String> getTokens(Analyzer analyzer, String field, String value) throws IOException { ArrayList<String> tokens = new ArrayList<String>(); TokenStream ts = analyzer.tokenStream(field, value); ts.reset();// w ww .j a v a 2 s. c o m while (ts.incrementToken()) { CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class); String token = new String(termAttribute.buffer(), 0, termAttribute.length()); tokens.add(token); } ts.end(); ts.close(); return tokens; }
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/** * Create GlobalSight TM tokens from a provided segment string * using GsAnalyzer./* w w w .j av a 2 s .c o m*/ * * @param p_text fuzzy match format string * @return List of c.g.l.tm2.index.Tokens */ public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset(); //GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class); //org.apache.lucene.analysis.Token luceneToken = null; List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { // luceneToken = gsAtt.getToken(); CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString()); } tokenStream.close(); return buildTokenList(tokens); }
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/** * Create GlobalSight TM tokens from a provided segment string * using GsAnalyzer. This method is suitable for use with TM3 * fuzzy indices, and does two things differently than createGsTokens(): * 1) It returns tokens in the order in which they appear * 2) It does not collapse duplicate tokens (and correspondingly does * not return count information)/*from w w w .j a va 2 s.c o m*/ * * @param p_text fuzzy match format string * @return List of Strings, each representing one token */ public static List<String> createTm3Tokens(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset(); List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString()); } tokenStream.close(); return tokens; }
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/**
 * Same as createTm3Tokens() but builds the GsAnalyzer without stop-word
 * filtering (second constructor argument {@code false}).
 *
 * @param p_text   fuzzy match format string
 * @param p_locale locale used to configure the GsAnalyzer
 * @return List of Strings, each representing one token
 * @throws Exception if tokenization fails
 */
public static List<String> createTm3TokensNoStopWord(String p_text, GlobalSightLocale p_locale)
        throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale, false);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    List<String> tokens = new ArrayList<String>();
    try {
        tokenStream.reset();
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        tokenStream.end(); // was missing
    } finally {
        // The stream is now closed on all paths, so the @SuppressWarnings("resource")
        // that hid the leak is no longer needed and has been removed.
        tokenStream.close();
    }
    return tokens;
}
From source file:com.grantingersoll.intell.index.BayesUpdateRequestProcessor.java
License:Apache License
/**
 * Tokenizes {@code input} with this processor's analyzer for {@code inputField}
 * and returns the terms as an array.
 *
 * @param input text to tokenize
 * @return the produced terms, in stream order
 * @throws IOException if the token stream fails
 */
public String[] tokenizeField(String input) throws IOException {
    ArrayList<String> tokenList = new ArrayList<String>(256);
    TokenStream ts = analyzer.tokenStream(inputField, new StringReader(input));
    try {
        // reset() was missing; modern Lucene (4+) throws IllegalStateException
        // if incrementToken() is called before reset() — TODO confirm target version.
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            tokenList.add(termAtt.toString());
        }
        ts.end();
    } finally {
        ts.close(); // was never closed — leaked a stream per call
    }
    return tokenList.toArray(new String[tokenList.size()]);
}
From source file:com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph.WordCountGraphTokenizer.java
License:Open Source License
public void parse(String s) { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true);/* w w w . j a va 2 s .c o m*/ DocumentBuilder builder; counts = new HashMap<String, Integer>(); try { builder = factory.newDocumentBuilder(); Document doc = builder.parse(new InputSource(new StringReader(s))); XPathFactory xfactory = XPathFactory.newInstance(); XPath xpath = xfactory.newXPath(); title = xpath.evaluate("//page/title/text()", doc); title = title.replaceAll("\\s", "_"); // title = title.replaceAll("^[^a-zA-Z0-9]", "#"); // title = title.replaceAll("[^a-zA-Z0-9.]", "_"); id = xpath.evaluate("//page/id/text()", doc); String text = xpath.evaluate("//page/revision/text/text()", doc); if (!text.isEmpty()) { Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); TokenStream stream = analyzer.tokenStream(null, new StringReader(text)); while (stream.incrementToken()) { String token = stream.getAttribute(TermAttribute.class).term(); if (dictionary != null && !dictionary.contains(token)) continue; if (counts.containsKey(token)) counts.put(token, counts.get(token) + 1); else counts.put(token, 1); } } } catch (ParserConfigurationException e) { e.printStackTrace(); } catch (SAXException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (XPathExpressionException e) { e.printStackTrace(); } }
From source file:com.isotrol.impe3.lucene.PortalSpanishAnalyzerTest.java
License:Open Source License
/**
 * Diagnostic helper: tokenizes {@code text} with analyzer {@code a} and prints
 * "[name] text => tokens" to stdout.
 *
 * @param name label printed with the result
 * @param a    analyzer under test
 * @param text input to tokenize
 * @throws IOException if the token stream fails
 */
private void test(String name, Analyzer a, String text) throws IOException {
    final Reader r = new StringReader(text);
    final TokenStream s = a.tokenStream(null, r);
    List<String> list = Lists.newLinkedList();
    try {
        s.reset();
        // hasAttribute() is invariant for the life of the stream — check once,
        // not on every token.
        if (s.hasAttribute(CharTermAttribute.class)) {
            final CharTermAttribute term = s.getAttribute(CharTermAttribute.class);
            while (s.incrementToken()) {
                list.add(term.toString());
            }
        } else {
            // Drain the stream so side effects match the original loop.
            while (s.incrementToken()) {
            }
        }
        s.end(); // was missing
    } finally {
        s.close(); // was missing — stream leaked
    }
    System.out.printf("[%s] %s => %s\n", name, text, list);
}
From source file:com.isotrol.impe3.lucene.PrefixAnalyzedQueryParser.java
License:Open Source License
/**
 * Builds a prefix query whose prefix has been run through the analyzer, so the
 * prefix matches analyzed index terms. Falls back to the default (unanalyzed)
 * prefix query when analysis yields nothing or fails with an IOException.
 *
 * @param field   field being queried
 * @param termStr raw prefix text from the query string
 * @return the prefix query
 * @throws ParseException propagated from the superclass
 */
@Override
protected org.apache.lucene.search.Query getPrefixQuery(String field, String termStr)
        throws ParseException {
    try {
        TokenStream ts = analyzer.tokenStream(field, new StringReader(termStr));
        try {
            // reset() was missing; required before incrementToken() on Lucene 4+
            // — TODO confirm target version.
            ts.reset();
            if (ts.incrementToken() && ts.hasAttribute(CharTermAttribute.class)) {
                String term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term != null) {
                    return super.getPrefixQuery(field, term);
                }
            }
            ts.end();
        } finally {
            ts.close(); // was never closed — leaked per prefix query
        }
    } catch (IOException ignored) {
        // Analysis failure is deliberately non-fatal: fall through to the
        // superclass's unanalyzed prefix handling below.
    }
    return super.getPrefixQuery(field, termStr);
}
From source file:com.lou.simhasher.seg.WordsSegment.java
License:Open Source License
/**
 * Segments the input string into words using IKAnalyzer.
 *
 * @param str text to segment
 * @return the segmented words, in stream order; partial results are returned
 *         if an IOException occurs mid-stream (the error is logged)
 */
public static List<String> getCutWords(String str) {
    Analyzer analyzer = new IKAnalyzer();
    Reader r = new StringReader(str);
    TokenStream ts = analyzer.tokenStream("searchValue", r);
    // Ensure the char-term attribute exists, then hold one reference —
    // the same instance is reused for every token.
    CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
    List<String> list = new ArrayList<String>();
    try {
        try {
            while (ts.incrementToken()) {
                list.add(ta.toString());
            }
        } finally {
            ts.close(); // was never closed — stream leaked on every call
        }
    } catch (IOException e) {
        // NOTE(review): message text appears garbled by encoding loss in the
        // original source; preserved as-is. Consider logging the exception
        // object itself to keep the stack trace.
        logger.error("?IO" + e.getMessage());
    }
    return list;
}