Usage examples for org.apache.lucene.analysis.TokenStream#incrementToken
public abstract boolean incrementToken() throws IOException;
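The general contract: call reset() before the first incrementToken(), consume tokens while it returns true, then call end() and close(). Below is a minimal sketch of that loop against a recent Lucene API; the StandardAnalyzer, the "body" field name, and the sample text are illustrative assumptions, not taken from the examples that follow.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(); // assumed analyzer, for illustration only
        try (TokenStream ts = analyzer.tokenStream("body", "some sample text")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(term.toString());
            }
            ts.end();                     // records end-of-stream attribute state
        }                                 // try-with-resources calls close()
    }
}

Note that several of the examples below skip end(), and the oldest ones (written against Lucene 3.x) also skip reset(); recent Lucene versions throw an IllegalStateException if reset() is omitted.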
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
@SuppressWarnings("resource") public static List<String> createTm3TokensNoStopWord(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale, false); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset();/*from ww w . j a va 2 s . co m*/ List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString()); } tokenStream.close(); return tokens; }
From source file:com.globalsight.ling.tm2.lucene.TuvDocument.java
License:Apache License
private int getTotalTokenCount(String text, Analyzer analyzer) throws Exception {
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(text));
    tokenStream.reset();
    int tokenCount = 0;
    while (tokenStream.incrementToken()) {
        tokenCount++;
    }
    tokenStream.close(); // release the stream
    return tokenCount;
}
From source file:com.grantingersoll.intell.index.BayesUpdateRequestProcessor.java
License:Apache License
public String[] tokenizeField(String input) throws IOException {
    ArrayList<String> tokenList = new ArrayList<String>(256);
    TokenStream ts = analyzer.tokenStream(inputField, new StringReader(input));
    ts.reset(); // required before the first incrementToken() call
    while (ts.incrementToken()) {
        tokenList.add(ts.getAttribute(CharTermAttribute.class).toString());
    }
    ts.close();
    return tokenList.toArray(new String[tokenList.size()]);
}
From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanBigramQueryGeneratorTest.java
License:Open Source License
private void test_that_generated_bigram_query_match_the_referenced_query() throws IOException {
    // Get stemmed question
    SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER);
    EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer();
    TokenStream ts = ea.tokenStream("field", question);
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    List<String> stemmedQuestion = new ArrayList<String>();
    while (ts.incrementToken())
        stemmedQuestion.add(charTermAttribute.toString());

    // Get query terms
    BooleanClause[] clauses = ((BooleanQuery) spanBigramQuery).getClauses();
    SpanQuery[] queries;
    String term1, term2;
    List<String> unigrams = new ArrayList<String>();
    int numFields = clauses.length / (2 * stemmedQuestion.size() - 1);

    // Test bigrams
    int bigramidx = 0;
    for (int idx = 0; idx < clauses.length; idx++) {
        Query q = clauses[idx].getQuery();
        if (q instanceof SpanNearQuery) {
            queries = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
            int termidx = bigramidx / numFields;
            term1 = ((SpanTermQuery) queries[0]).getTerm().text();
            term2 = ((SpanTermQuery) queries[1]).getTerm().text();
            assertEquals("Extracted first term doesn't match the stemmed term",
                    stemmedQuestion.get(termidx), term1);
            assertEquals("Extracted second term doesn't match the stemmed term",
                    stemmedQuestion.get(termidx + 1), term2);
            bigramidx++;
        } else if (q instanceof TermQuery) {
            unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text());
        } else {
            assertTrue("Unknown type of query found!", false);
        }
    }

    // Test unigrams
    for (String s : unigrams)
        assertTrue(stemmedQuestion.contains(s));
    for (String s : stemmedQuestion)
        assertTrue(unigrams.contains(s));
}
From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanTrigramQueryGeneratorTest.java
License:Open Source License
private void test_that_generated_trigram_query_match_the_referenced_query() throws IOException {
    Set<Term> queryTerms = new HashSet<Term>();

    // Get stemmed question
    SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER);
    EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer();
    TokenStream ts = ea.tokenStream("field", question);
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    List<String> stemmedQuestion = new ArrayList<String>();
    while (ts.incrementToken())
        stemmedQuestion.add(charTermAttribute.toString());

    // Get query terms
    spanTrigramQuery.extractTerms(queryTerms);
    BooleanClause[] clauses = ((BooleanQuery) spanTrigramQuery).getClauses();
    SpanQuery[] qs;
    String term1, term2, term3;
    int numFields = clauses.length / (3 * stemmedQuestion.size() - 3);
    List<String> unigrams = new ArrayList<String>();
    int idx = 0;

    // Test trigrams
    int trigramidx = 0;
    for (idx = clauses.length - numFields * (stemmedQuestion.size() - 2); idx < clauses.length; idx++) {
        qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
        int termidx = trigramidx / numFields;
        term1 = ((SpanTermQuery) qs[0]).getTerm().text();
        term2 = ((SpanTermQuery) qs[1]).getTerm().text();
        term3 = ((SpanTermQuery) qs[2]).getTerm().text();
        assertEquals("Extracted first term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx), term1);
        assertEquals("Extracted second term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx + 1), term2);
        assertEquals("Extracted third term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx + 2), term3);
        trigramidx++;
    }

    // Test bigrams
    int bigramidx = 0;
    for (idx = 0; idx < (2 * stemmedQuestion.size() - 1) * numFields; idx++) {
        Query q = clauses[idx].getQuery();
        if (q instanceof SpanNearQuery) {
            qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
            int termidx = bigramidx / numFields;
            term1 = ((SpanTermQuery) qs[0]).getTerm().text();
            term2 = ((SpanTermQuery) qs[1]).getTerm().text();
            assertEquals("Extracted first term in the bigram doesn't match the stemmed term",
                    stemmedQuestion.get(termidx), term1);
            assertEquals("Extracted second term in the bigram doesn't match the stemmed term",
                    stemmedQuestion.get(termidx + 1), term2);
            bigramidx++;
        } else if (q instanceof TermQuery) {
            unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text());
        } else {
            assertTrue("Unknown type of query found!", false);
        }
    }

    // Test unigrams
    for (String s : unigrams)
        assertTrue(stemmedQuestion.contains(s));
    for (String s : stemmedQuestion)
        assertTrue(unigrams.contains(s));
}
From source file:com.ikon.analysis.AnalyzerDemo.java
License:Open Source License
/**
 * Analyze and display tokens.
 */
private static void analyze(String string, Analyzer analyzer) throws IOException {
    StringBuffer buffer = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(string));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // required before the first incrementToken() call
    buffer.append(analyzer.getClass().getName());
    buffer.append(" -> ");
    while (stream.incrementToken()) {
        buffer.append(" [");
        buffer.append(term.toString());
        buffer.append("]");
    }
    String output = buffer.toString();
    log.info(output);
}
From source file:com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph.WordCountGraphTokenizer.java
License:Open Source License
public void parse(String s) {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(true);
    DocumentBuilder builder;
    counts = new HashMap<String, Integer>();
    try {
        builder = factory.newDocumentBuilder();
        Document doc = builder.parse(new InputSource(new StringReader(s)));
        XPathFactory xfactory = XPathFactory.newInstance();
        XPath xpath = xfactory.newXPath();
        title = xpath.evaluate("//page/title/text()", doc);
        title = title.replaceAll("\\s", "_");
        // title = title.replaceAll("^[^a-zA-Z0-9]", "#");
        // title = title.replaceAll("[^a-zA-Z0-9.]", "_");
        id = xpath.evaluate("//page/id/text()", doc);
        String text = xpath.evaluate("//page/revision/text/text()", doc);
        if (!text.isEmpty()) {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
            TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
            while (stream.incrementToken()) {
                String token = stream.getAttribute(TermAttribute.class).term();
                if (dictionary != null && !dictionary.contains(token))
                    continue;
                if (counts.containsKey(token))
                    counts.put(token, counts.get(token) + 1);
                else
                    counts.put(token, 1);
            }
        }
    } catch (ParserConfigurationException e) {
        e.printStackTrace();
    } catch (SAXException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (XPathExpressionException e) {
        e.printStackTrace();
    }
}
From source file:com.isotrol.impe3.lucene.PortalSpanishAnalyzerTest.java
License:Open Source License
private void test(String name, Analyzer a, String text) throws IOException {
    final Reader r = new StringReader(text);
    final TokenStream s = a.tokenStream(null, r);
    List<String> list = Lists.newLinkedList();
    s.reset();
    while (s.incrementToken()) {
        if (s.hasAttribute(CharTermAttribute.class)) {
            list.add(s.getAttribute(CharTermAttribute.class).toString());
        }
    }
    System.out.printf("[%s] %s => %s\n", name, text, list);
}
From source file:com.isotrol.impe3.lucene.PrefixAnalyzedQueryParser.java
License:Open Source License
@Override
protected org.apache.lucene.search.Query getPrefixQuery(String field, String termStr)
        throws ParseException {
    try {
        TokenStream ts = analyzer.tokenStream(field, new StringReader(termStr));
        if (ts.incrementToken() && ts.hasAttribute(CharTermAttribute.class)) {
            String term = ts.getAttribute(CharTermAttribute.class).toString();
            if (term != null) {
                return super.getPrefixQuery(field, term);
            }
        }
    } catch (IOException e) {
        // ignore and fall back to the unanalyzed term below
    }
    return super.getPrefixQuery(field, termStr);
}
From source file:com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java
License:Open Source License
public void justTokenize(String text, List<TokenTerm> tokenTerms) throws IOException {
    StringReader reader = new StringReader(text);
    TokenStream ts1 = tokenizer.create(reader);
    TokenStream ts2 = new TokenTermPopulateFilter(tokenTerms, ts1);
    try {
        while (ts2.incrementToken())
            ;
    } finally {
        IOUtils.close(ts2, ts1);
    }
}