List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
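incrementToken() advances the stream to its next token, returning false once the stream is exhausted; the token itself is read through attributes (such as CharTermAttribute) registered on the stream. Callers must follow the TokenStream workflow: reset(), a loop of incrementToken(), then end() and close(). A minimal consume loop, sketched against Lucene 4.x (the StandardAnalyzer and the field name "content" are illustrative choices, not mandated by the API):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public static void printTokens(String text) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                       // mandatory before the first incrementToken()
    while (ts.incrementToken()) {     // false once the stream is exhausted
        System.out.println(termAtt.toString());
    }
    ts.end();                         // records end-of-stream state (e.g. final offset)
    ts.close();                       // releases resources
    analyzer.close();
}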
From source file:it.cnr.isti.hpc.dexter.analysis.SpotCleaner.java
License:Apache License
public String clean(String spot) throws IOException {
    try {
        spot = URLDecoder.decode(spot, "UTF-8");
    } catch (IllegalArgumentException e) {
    }
    analyzer.lowercase(spot.length() > 4);
    TokenStream ts = analyzer.tokenStream("content", new StringReader(spot));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    sb.setLength(0);
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
        sb.append(termAtt.toString());
        sb.append(' ');
        if (tokens > maxSpotLength) {
            ts.end();
            ts.close(); // release the stream before the early return
            return "";
        }
    }
    ts.end();
    ts.close(); // close() rather than reset(): calling reset() after end() violates the TokenStream contract
    if (sb.length() > 0)
        sb.setLength(sb.length() - 1); // drop the trailing space
    String finalSpot = sb.toString();
    for (Filter<String> filter : filters) {
        if (filter.isFilter(finalSpot)) {
            finalSpot = "";
        }
    }
    return finalSpot;
}
From source file:it.cnr.isti.hpc.dexter.spot.DocumentFrequencyGenerator.java
License:Apache License
private void initBloomFilter(Iterator<String> spotIterator) {
    String spot = spotIterator.next();
    analyzer.setShingles(false);
    ProgressLogger pl = new ProgressLogger("added {} spots to the bloom filter", 100000);
    pl.up();
    while (spotIterator.hasNext()) {
        String next = spotIterator.next();
        if (next.equals(spot))
            continue;
        pl.up();
        spot = next;
        TokenStream ts = null;
        try {
            ts = analyzer.tokenStream("content", new StringReader(spot));
        } catch (IOException e) {
            e.printStackTrace();
            continue; // skip this spot: ts would be null below
        }
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();
            // only the first token of each spot is added to the bloom filter
            if (ts.incrementToken()) {
                spot = termAtt.toString();
                bf.add(spot);
            }
            ts.end();
            ts.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
From source file:it.unibz.instasearch.indexing.StorageIndexer.java
License:Open Source License
/**
 * Extracts terms from text.
 *
 * @param text
 * @return a map of terms to their offsets in text
 * @throws IOException
 */
public static Map<String, List<Integer>> extractTextTerms(String text) throws IOException {
    Map<String, List<Integer>> terms = new HashMap<String, List<Integer>>();
    TokenStream tokenStream = fileAnalyzer.tokenStream(Field.CONTENTS.toString(), new StringReader(text));
    TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.reset(); // required by the TokenStream contract before the first incrementToken()
    while (tokenStream.incrementToken()) {
        String termText = termAtt.term().toLowerCase();
        int offset = offsetAtt.startOffset();
        List<Integer> offsets = terms.get(termText);
        if (offsets == null) {
            offsets = new LinkedList<Integer>();
            terms.put(termText, offsets);
        }
        offsets.add(offset);
    }
    tokenStream.close();
    return terms;
}
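Note that this example is written against the pre-4.0 attribute API: TermAttribute was deprecated in Lucene 3.1 and removed in 4.0. A minimal sketch of the same extraction on Lucene 4.x, with CharTermAttribute in place of TermAttribute (the analyzer and field parameters are made explicit here for self-containment; they are not part of the original source):

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public static Map<String, List<Integer>> extractTextTerms(Analyzer analyzer, String field, String text)
        throws IOException {
    Map<String, List<Integer>> terms = new HashMap<String, List<Integer>>();
    TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(text));
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        String termText = termAtt.toString().toLowerCase();
        List<Integer> offsets = terms.get(termText);
        if (offsets == null) {
            offsets = new LinkedList<Integer>();
            terms.put(termText, offsets);
        }
        offsets.add(offsetAtt.startOffset());
    }
    tokenStream.end();
    tokenStream.close();
    return terms;
}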
From source file:ivory.core.tokenize.Tokenizer.java
License:Apache License
/**
 * Convert a TokenStream object into a string.
 *
 * @param tokenStream object returned by a Lucene tokenizer
 * @return String corresponding to the tokens output by tokenStream
 */
protected static String streamToString(TokenStream tokenStream) {
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.clearAttributes();
    StringBuilder tokenized = new StringBuilder();
    try {
        // assumes the caller has already called reset() on the stream
        while (tokenStream.incrementToken()) {
            tokenized.append(termAtt.toString()).append(' ');
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tokenized.toString().trim();
}
From source file:jaligner.Sequence.java
License:Open Source License
/**
 * Constructor.
 *
 * @param sequence
 */
public Sequence(String sequence, Analyzer analyzer, int max_length) throws IOException {
    super();
    this.sequence = sequence;
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(sequence));
    Token.TokenAttributeFactory tokenAttributeFactory = new Token.TokenAttributeFactory(
            stream.getAttributeFactory());
    Vector<Token> tokenVector = new Vector<Token>();
    stream.reset(); // required before the first call to incrementToken()
    while (stream.incrementToken() && tokenVector.size() < max_length) {
        Token token = (Token) tokenAttributeFactory.createAttributeInstance(Token.class);
        CharTermAttribute charTerm = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        // copy the term buffer and offsets into the new Token
        token.reinit(charTerm.buffer(), 0, charTerm.length(), offset.startOffset(), offset.endOffset());
        token.setOffset(offset.startOffset(), offset.endOffset());
        if (stream.hasAttribute(PositionIncrementAttribute.class)) {
            PositionIncrementAttribute positionIncrement = stream
                    .getAttribute(PositionIncrementAttribute.class);
            token.setPositionIncrement(positionIncrement.getPositionIncrement());
        }
        if (stream.hasAttribute(TypeAttribute.class)) {
            TypeAttribute type = stream.getAttribute(TypeAttribute.class);
            token.setType(type.type());
        }
        tokenVector.add(token);
    }
    stream.end();
    stream.close();
    this.tokens = tokenVector.toArray(new Token[tokenVector.size()]);
}
From source file:jobs.LoadOntologyJob.java
private int getTotalLength(String label) throws IOException {
    // this analyzer does not remove stop words
    Analyzer customanalyzer = new CustomStopWordsStandardAnalyzer(Version.LUCENE_47);
    List<String> resultStop = new ArrayList<String>();
    TokenStream customstream = customanalyzer.tokenStream(null, new StringReader(label));
    customstream.reset();
    while (customstream.incrementToken()) {
        resultStop.add(customstream.getAttribute(CharTermAttribute.class).toString());
    }
    customstream.end();
    customstream.close(); // release the stream to avoid a resource leak
    return resultStop.size();
}
From source file:jobs.LoadOntologyJob.java
private int getLengthWithoutStopWords(String label) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    List<String> result = new ArrayList<String>();
    TokenStream stream = analyzer.tokenStream(null, new StringReader(label));
    stream.reset();
    while (stream.incrementToken()) {
        result.add(stream.getAttribute(CharTermAttribute.class).toString());
    }
    stream.end();
    stream.close(); // release the stream to avoid a resource leak
    return result.size();
}
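Taken together, the two helpers apparently measure how many tokens of a label survive stop-word removal; a hypothetical caller (not part of the original source) could derive the stop-word count from their difference:

int stopWordCount = getTotalLength(label) - getLengthWithoutStopWords(label);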
From source file:jp.co.atware.solr.analizers.cjk.CJKBigramFilterTest.java
License:Apache License
@Theory
public void testIncrementToken(Fixture testData) throws Exception {
    TokenStream tokenStream = getTokenStream(testData.input);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    List<String> actual = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        actual.add(termAtt.toString());
    }
    assertThat(actual.toArray(EMPTY_STRING_ARRAY), is(testData.expected));
}
From source file:jp.co.atware.solr.analizers.cjk.CranioCaudalFilterTest.java
License:Apache License
@Theory
public void testIncrementToken(TestData testData) throws Exception {
    TokenStream tokenStream = createTokenStream(testData.input);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    List<String> actual = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        actual.add(termAtt.toString());
    }
    assertThat(actual.toArray(EMPTY_STRING_ARRAY), is(testData.expected));
}
From source file:jp.co.atware.solr.analizers.cjk.MultistageMappingCharFilterTest.java
License:Apache License
@Theory
public void testMultiMappingAndOffset(TestData testData) throws Exception {
    Reader reader = charFilterFactory.create(new StringReader(testData.input));
    TokenStream tokenStream = tokenizerFactory.create(reader);
    OffsetAttribute actualOffset = tokenStream.getAttribute(OffsetAttribute.class);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    assertThat(tokenStream.incrementToken(), is(true));
    assertThat(termAtt.toString(), is(testData.expected));
    assertThat(actualOffset.startOffset(), is(testData.start));
    assertThat(actualOffset.endOffset(), is(testData.end));
    assertThat(tokenStream.incrementToken(), is(false));
}