List of usage examples for org.apache.lucene.analysis TokenStream end

Prototype:

    public void end() throws IOException

The consumer calls end() after the last token has been consumed, that is, after incrementToken() has returned false (using the new TokenStream API). It lets the stream perform end-of-stream operations, such as setting the final offset.
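Before the individual examples, here is a minimal, self-contained sketch of the consume contract that all of the snippets below follow: reset() before the first incrementToken(), end() once incrementToken() has returned false, then close(). This is a sketch only; the class name, field name, and input text are placeholders, and it assumes Lucene 4.6 (the version the Mahout examples below target).

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class TokenStreamEndSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
        TokenStream ts = analyzer.tokenStream("field", new StringReader("some sample text"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                   // required before the first incrementToken()
            while (ts.incrementToken()) { // consume tokens until the stream is exhausted
                System.out.println(termAtt.toString());
            }
            ts.end();                     // called once incrementToken() has returned false
        } finally {
            ts.close();                   // release underlying resources
        }
    }
}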
From source file:org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String document = value.toString();
    document = StringEscapeUtils.unescapeHtml4(CLOSE_TEXT_TAG_PATTERN
            .matcher(OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
    String catMatch = findMatchingCategory(document);
    if (!"Unknown".equals(catMatch)) {
        StringBuilder contents = new StringBuilder(1000);
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
        }
        context.write(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
        stream.end();
        Closeables.close(stream, true);
    }
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
/** normal case, unfiltered analyzer */
@Test
public void testAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    validateTokens(allTokens, ts);
    ts.end();
    ts.close();
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
/** filtered analyzer */
@Test
public void testNonKeepdAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
    validateTokens(expectedNonKeepTokens, f);
    ts.end();
    ts.close();
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
/** keep analyzer */
@Test
public void testKeepAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
    validateTokens(expectedKeepTokens, f);
    ts.end();
    ts.close();
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
/** shingles, keep those matching whitelist */
@Test
public void testShingleFilteredAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    ShingleFilter sf = new ShingleFilter(ts, 3);
    TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
    validateTokens(expectedShingleTokens, f);
    ts.end();
    ts.close();
}
From source file:org.apache.mahout.utils.regex.AnalyzerTransformer.java
License:Apache License
@Override
public String transformMatch(String match) {
    StringBuilder result = new StringBuilder();
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(fieldName, new StringReader(match));
        ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        TokenStreamIterator iter = new TokenStreamIterator(ts);
        while (iter.hasNext()) {
            result.append(iter.next()).append(' ');
        }
        ts.end();
    } catch (IOException e) {
        throw new IllegalStateException(e);
    } finally {
        try {
            Closeables.close(ts, true);
        } catch (IOException e) {
            log.error(e.getMessage(), e);
        }
    }
    return result.toString();
}
From source file:org.apache.mahout.vectorizer.document.SequenceFileTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    context.write(key, document);
}
From source file:org.apache.maven.index.DefaultQueryCreator.java
License:Apache License
protected int countTerms(final IndexerField indexerField, final String query) {
    try {
        TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query));
        ts.reset();
        int result = 0;
        while (ts.incrementToken()) {
            result++;
        }
        ts.end();
        ts.close();
        return result;
    } catch (IOException e) {
        // will not happen
        return 1;
    }
}
From source file:org.apache.solr.handler.AnalysisRequestHandlerBase.java
License:Apache License
/**
 * Analyzes the given text using the given analyzer and returns the produced tokens.
 *
 * @param query    The query to analyze.
 * @param analyzer The analyzer to use.
 */
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream("", query);
        final Set<BytesRef> tokens = new HashSet<BytesRef>();
        final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
        final BytesRef bytes = bytesAtt.getBytesRef();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            bytesAtt.fillBytesRef();
            tokens.add(BytesRef.deepCopyOf(bytes));
        }
        tokenStream.end();
        return tokens;
    } catch (IOException ioe) {
        throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    } finally {
        IOUtils.closeWhileHandlingException(tokenStream);
    }
}
From source file:org.apache.solr.handler.AnalysisRequestHandlerBase.java
License:Apache License
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
    final List<AttributeSource> tokens = new ArrayList<AttributeSource>();
    final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
    // for backwards compatibility, add all "common" attributes
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(TypeAttribute.class);
    try {
        tokenStream.reset();
        int position = 0;
        while (tokenStream.incrementToken()) {
            position += posIncrAtt.getPositionIncrement();
            trackerAtt.setActPosition(position);
            tokens.add(tokenStream.cloneAttributes());
        }
        tokenStream.end();
    } catch (IOException ioe) {
        throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    } finally {
        IOUtils.closeWhileHandlingException(tokenStream);
    }
    return tokens;
}
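A common thread in the examples above is the cleanup step: ts.close(), Closeables.close(stream, true), or IOUtils.closeWhileHandlingException(tokenStream). Since TokenStream implements Closeable, the same contract can also be written with try-with-resources on Java 7+. A minimal sketch under that assumption (the analyzer, field name, and input text are placeholders, not from any of the sources above):

try (TokenStream ts = analyzer.tokenStream("field", new StringReader("some sample text"))) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                   // still required before consuming
    while (ts.incrementToken()) {
        // use termAtt here
    }
    ts.end();                     // still required after incrementToken() returns false
}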