List of usage examples for org.apache.lucene.analysis.TokenStream#reset()
Method signature: public void reset() throws IOException
The TokenStream contract requires the consumer to call reset() before the first call to incrementToken(), and to call end() and then close() once the stream has been exhausted; every example below follows that workflow.
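Before the per-project examples, here is a minimal self-contained sketch of that workflow. It mirrors the examples below (WhitespaceAnalyzer on Lucene 4.6); the field name "body", the sample text, and the class name are placeholders chosen for illustration, not taken from any of the projects listed.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class TokenStreamResetExample {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    try (TokenStream ts = analyzer.tokenStream("body", new StringReader("lucene token streams must be reset"))) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();                    // mandatory before the first incrementToken()
      while (ts.incrementToken()) {  // advance through the tokens one by one
        System.out.println(termAtt.toString());
      }
      ts.end();                      // records end-of-stream state (e.g. final offset)
    }                                // try-with-resources calls close()
  }
}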
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
/** normal case, unfiltered analyzer */
@Test
public void testAnalyzer() throws IOException {
  Reader reader = new StringReader(input);
  Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
  TokenStream ts = analyzer.tokenStream(null, reader);
  ts.reset();
  validateTokens(allTokens, ts);
  ts.end();
  ts.close();
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
/** filtered analyzer */
@Test
public void testNonKeepdAnalyzer() throws IOException {
  Reader reader = new StringReader(input);
  Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
  TokenStream ts = analyzer.tokenStream(null, reader);
  ts.reset();
  TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
  validateTokens(expectedNonKeepTokens, f);
  ts.end();
  ts.close();
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
/** keep analyzer */
@Test
public void testKeepAnalyzer() throws IOException {
  Reader reader = new StringReader(input);
  Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
  TokenStream ts = analyzer.tokenStream(null, reader);
  ts.reset();
  TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
  validateTokens(expectedKeepTokens, f);
  ts.end();
  ts.close();
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
/** shingles, keep those matching whitelist */
@Test
public void testShingleFilteredAnalyzer() throws IOException {
  Reader reader = new StringReader(input);
  Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
  TokenStream ts = analyzer.tokenStream(null, reader);
  ts.reset();
  ShingleFilter sf = new ShingleFilter(ts, 3);
  TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
  validateTokens(expectedShingleTokens, f);
  ts.end();
  ts.close();
}
From source file:org.apache.mahout.utils.regex.AnalyzerTransformer.java
License:Apache License
@Override
public String transformMatch(String match) {
  StringBuilder result = new StringBuilder();
  TokenStream ts = null;
  try {
    ts = analyzer.tokenStream(fieldName, new StringReader(match));
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    TokenStreamIterator iter = new TokenStreamIterator(ts);
    while (iter.hasNext()) {
      result.append(iter.next()).append(' ');
    }
    ts.end();
  } catch (IOException e) {
    throw new IllegalStateException(e);
  } finally {
    try {
      Closeables.close(ts, true);
    } catch (IOException e) {
      log.error(e.getMessage(), e);
    }
  }
  return result.toString();
}
From source file:org.apache.mahout.vectorizer.document.SequenceFileTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
  TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  stream.reset();
  StringTuple document = new StringTuple();
  while (stream.incrementToken()) {
    if (termAtt.length() > 0) {
      document.add(new String(termAtt.buffer(), 0, termAtt.length()));
    }
  }
  stream.end();
  Closeables.close(stream, true);
  context.write(key, document);
}
From source file:org.apache.maven.index.DefaultQueryCreator.java
License:Apache License
protected int countTerms(final IndexerField indexerField, final String query) {
  try {
    TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query));
    ts.reset();
    int result = 0;
    while (ts.incrementToken()) {
      result++;
    }
    ts.end();
    ts.close();
    return result;
  } catch (IOException e) {
    // will not happen
    return 1;
  }
}
From source file:org.apache.nutch.scoring.similarity.cosine.Model.java
License:Apache License
/**
 * Used to create a DocVector from given String text. Used during the parse stage of the crawl
 * cycle to create a DocVector of the currently parsed page from the parseText attribute value
 * @param content The text to tokenize
 * @param mingram Value of mingram for tokenizing
 * @param maxgram Value of maxgram for tokenizing
 */
public static DocVector createDocVector(String content, int mingram, int maxgram) {
  LuceneTokenizer tokenizer;
  if (mingram > 1 && maxgram > 1) {
    LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
    tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
  } else if (mingram > 1) {
    maxgram = mingram;
    LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
    tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
  } else if (stopWords != null) {
    tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true, StemFilterType.PORTERSTEM_FILTER);
  } else {
    tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true, StemFilterType.PORTERSTEM_FILTER);
  }
  TokenStream tStream = tokenizer.getTokenStream();
  HashMap<String, Integer> termVector = new HashMap<>();
  try {
    CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
    tStream.reset();
    while (tStream.incrementToken()) {
      String term = charTermAttribute.toString();
      LOG.debug(term);
      if (termVector.containsKey(term)) {
        int count = termVector.get(term);
        count++;
        termVector.put(term, count);
      } else {
        termVector.put(term, 1);
      }
    }
    DocVector docVector = new DocVector();
    docVector.setTermFreqVector(termVector);
    return docVector;
  } catch (IOException e) {
    LOG.error("Error creating DocVector : {}", StringUtils.stringifyException(e));
  }
  return null;
}
From source file:org.apache.roller.weblogger.business.search.IndexUtil.java
License:Apache License
/**
 * Create a lucene term from the first token of the input string.
 *
 * @param field
 *            The lucene document field to create a term with
 * @param input
 *            The input you wish to convert into a term
 *
 * @return Lucene search term
 */
public static Term getTerm(String field, String input) {
  if (input == null || field == null) {
    return null;
  }
  Analyzer analyzer = IndexManagerImpl.getAnalyzer();
  Term term = null;
  try {
    TokenStream tokens = analyzer.tokenStream(field, new StringReader(input));
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    tokens.reset();
    if (tokens.incrementToken()) {
      String termt = termAtt.toString();
      term = new Term(field, termt);
    }
  } catch (IOException e) {
    // ignored
  }
  return term;
}
From source file:org.apache.solr.analysis.DoubleMetaphoneFilterFactoryTest.java
License:Apache License
/**
 * Ensure that reset() removes any state (buffered tokens)
 */
public void testReset() throws Exception {
  DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
  factory.init(new HashMap<String, String>());
  TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
  TokenStream filteredStream = factory.create(inputStream);
  CharTermAttribute termAtt = filteredStream.addAttribute(CharTermAttribute.class);
  assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
  assertTrue(filteredStream.incrementToken());
  assertEquals(13, termAtt.length());
  assertEquals("international", termAtt.toString());
  filteredStream.reset();
  // ensure there are no more tokens, such as ANTRNXNL
  assertFalse(filteredStream.incrementToken());
}