Usage examples for org.apache.lucene.analysis.TokenStream#addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
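addAttribute() registers the requested attribute on the stream (or returns the existing instance if one is already registered), and that single instance is updated in place on each call to incrementToken(). The sketch below shows the standard consume loop; it assumes a hypothetical Analyzer instance named analyzer, an input String named text, and an illustrative field name "body".

try (TokenStream ts = analyzer.tokenStream("body", new StringReader(text))) {
    // The returned attribute instance is reused for every token.
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                     // required before the first incrementToken()
    while (ts.incrementToken()) {
        System.out.println(termAtt.toString());
    }
    ts.end();                       // records end-of-stream state (e.g. final offset)
}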
From source file: org.apache.mahout.utils.regex.AnalyzerTransformer.java
License: Apache License

@Override
public String transformMatch(String match) {
    StringBuilder result = new StringBuilder();
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(fieldName, new StringReader(match));
        ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        TokenStreamIterator iter = new TokenStreamIterator(ts);
        while (iter.hasNext()) {
            result.append(iter.next()).append(' ');
        }
        ts.end();
    } catch (IOException e) {
        throw new IllegalStateException(e);
    } finally {
        try {
            Closeables.close(ts, true);
        } catch (IOException e) {
            log.error(e.getMessage(), e);
        }
    }
    return result.toString();
}
From source file: org.apache.mahout.utils.vectors.text.document.SequenceFileTokenizerMapper.java
License: Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    // Uses the legacy Lucene 3.x TermAttribute API; note that this older example
    // does not call stream.reset(), which later Lucene versions require.
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.termLength() > 0) {
            document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
        }
    }
    context.write(key, document);
}
From source file: org.apache.mahout.vectorizer.document.SequenceFileTokenizerMapper.java
License: Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    context.write(key, document);
}
From source file: org.apache.mahout.vectorizer.encoders.LuceneTextValueEncoder.java
License: Apache License

/**
 * Tokenizes a string using the simplest method. This should be overridden
 * for more subtle tokenization.
 */
@Override
protected Iterable<String> tokenize(CharSequence originalForm) {
    try {
        TokenStream ts = analyzer.tokenStream(getName(), new CharSequenceReader(originalForm));
        ts.addAttribute(CharTermAttribute.class);
        return new LuceneTokenIterable(ts, false);
    } catch (IOException ex) {
        throw new IllegalStateException(ex);
    }
}
From source file: org.apache.nutch.scoring.similarity.cosine.Model.java
License: Apache License

/**
 * Creates a DocVector from the given String text. Used during the parse stage
 * of the crawl cycle to create a DocVector of the currently parsed page from
 * the parseText attribute value.
 * @param content The text to tokenize
 * @param mingram Value of mingram for tokenizing
 * @param maxgram Value of maxgram for tokenizing
 */
public static DocVector createDocVector(String content, int mingram, int maxgram) {
    LuceneTokenizer tokenizer;
    if (mingram > 1 && maxgram > 1) {
        LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
    } else if (mingram > 1) {
        maxgram = mingram;
        LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
    } else if (stopWords != null) {
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true, StemFilterType.PORTERSTEM_FILTER);
    } else {
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true, StemFilterType.PORTERSTEM_FILTER);
    }
    TokenStream tStream = tokenizer.getTokenStream();
    HashMap<String, Integer> termVector = new HashMap<>();
    try {
        CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
        tStream.reset();
        while (tStream.incrementToken()) {
            String term = charTermAttribute.toString();
            LOG.debug(term);
            if (termVector.containsKey(term)) {
                int count = termVector.get(term);
                count++;
                termVector.put(term, count);
            } else {
                termVector.put(term, 1);
            }
        }
        DocVector docVector = new DocVector();
        docVector.setTermFreqVector(termVector);
        return docVector;
    } catch (IOException e) {
        LOG.error("Error creating DocVector : {}", StringUtils.stringifyException(e));
    }
    return null;
}
From source file: org.apache.roller.weblogger.business.search.IndexUtil.java
License: Apache License

/**
 * Create a Lucene term from the first token of the input string.
 *
 * @param field The Lucene document field to create a term with
 * @param input The input you wish to convert into a term
 * @return Lucene search term
 */
public static Term getTerm(String field, String input) {
    if (input == null || field == null) {
        return null;
    }
    Analyzer analyzer = IndexManagerImpl.getAnalyzer();
    Term term = null;
    try {
        TokenStream tokens = analyzer.tokenStream(field, new StringReader(input));
        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        tokens.reset();
        if (tokens.incrementToken()) {
            String termt = termAtt.toString();
            term = new Term(field, termt);
        }
    } catch (IOException e) {
        // ignored
    }
    return term;
}
From source file: org.apache.solr.analysis.DoubleMetaphoneFilterFactoryTest.java
License: Apache License

/**
 * Ensure that reset() removes any state (buffered tokens).
 */
public void testReset() throws Exception {
    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
    factory.init(new HashMap<String, String>());
    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
    TokenStream filteredStream = factory.create(inputStream);
    CharTermAttribute termAtt = filteredStream.addAttribute(CharTermAttribute.class);
    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
    assertTrue(filteredStream.incrementToken());
    assertEquals(13, termAtt.length());
    assertEquals("international", termAtt.toString());
    filteredStream.reset();
    // ensure there are no more tokens, such as ANTRNXNL
    assertFalse(filteredStream.incrementToken());
}
From source file: org.apache.solr.analysis.SlowSynonymFilterFactory.java
License: Apache License

private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
    StringReader reader = new StringReader(source);
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<String>();
    try {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                tokList.add(termAtt.toString());
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        reader.close();
    }
    return tokList;
}
From source file: org.apache.solr.analysis.TestBufferedTokenStream.java
License: Apache License

public void testReset() throws Exception {
    final String input = "How now A B brown A cow B like A B thing?";
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    TokenStream ts = new AB_AAB_Stream(tokenizer);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    assertTrue(ts.incrementToken());
    assertEquals("How", term.toString());
    assertTrue(ts.incrementToken());
    assertEquals("now", term.toString());
    assertTrue(ts.incrementToken());
    assertEquals("A", term.toString());
    // reset back to the input; if reset() does not work correctly,
    // previously buffered tokens will remain
    tokenizer.reset(new StringReader(input));
    ts.reset();
    assertTrue(ts.incrementToken());
    assertEquals("How", term.toString());
}
From source file: org.apache.solr.analysis.TestCollationKeyFilterFactory.java
License: Apache License

private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
    CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    assertEquals(term1.toString(), term2.toString());
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
}