List of usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
From source file:com.doculibre.constellio.lucene.BaseLuceneIndexHelper.java
License:Open Source License
/**
 * Runs {@code str} through {@code analyzer} and concatenates the resulting terms.
 *
 * <p>Terms are appended back to back, with no separator between them.
 *
 * @param str the text to analyze
 * @param analyzer the analyzer to apply; when {@code null}, {@code str} is returned unchanged
 * @return the concatenated terms, or {@code str} itself when no analyzer was given
 * @throws IOException if consuming or closing the token stream fails
 */
public static String analyze(String str, Analyzer analyzer) throws IOException {
    if (analyzer == null) {
        return str;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(str));
    try {
        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        tokens.reset();
        while (tokens.incrementToken()) {
            // Copy straight from the term buffer; only the first length() chars are valid.
            norm.append(termAtt.buffer(), 0, termAtt.length());
        }
        tokens.end();
    } finally {
        // Always release the stream, even when tokenization fails part-way.
        tokens.close();
    }
    return norm.toString();
}
From source file:com.doculibre.constellio.utils.AnalyzerUtils.java
License:Open Source License
public static String analyzePhrase(String phrase, boolean useStopWords) { if (StringUtils.isNotBlank(phrase)) { String analysedPhrase;// www.j av a 2s . c o m Analyzer analyzer = getDefaultAnalyzer(useStopWords); StringBuilder norm = new StringBuilder(); TokenStream tokens; try { tokens = analyzer.tokenStream("", new StringReader(phrase)); tokens.reset(); CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class); while (tokens.incrementToken()) { norm.append(termAtt.buffer(), 0, termAtt.length()); } analysedPhrase = norm.toString().trim(); } catch (IOException e) { throw new RuntimeException(e); } return analysedPhrase; } else { return phrase; } }
From source file:com.faqit.similarity.NGramExtractor.java
License:Open Source License
/** * Extracts NGrams from a String of text. Can handle ngrams of any length * and also perform stop word removal before extraction * //from www. ja v a2s .com * @param text * the text that the ngrams should be extracted from * @param length * the length of the ngrams * @param stopWords * whether or not stopwords should be removed before extraction * @param overlap * whether or not the ngrams should overlap */ public void extract(String text, int length, Boolean stopWords, Boolean overlap) throws FileNotFoundException, IOException { this.text = text; this.length = length; this.stopWords = stopWords; this.overlap = overlap; nGrams = new LinkedList<String>(); uniqueNGrams = new LinkedList<String>(); nGramFreqs = new HashMap<String, Integer>(); /* * If the minLength and maxLength are both 1, then we want unigrams Make * use of a StopAnalyzer when stopwords should be removed Make use of a * SimpleAnalyzer when stop words should be included */ if (length == 1) { if (this.stopWords) { analyzer = new StandardAnalyzer(); } else { analyzer = new SimpleAnalyzer(); } } else { // Bigger than unigrams so use ShingleAnalyzerWrapper. Once // again, different analyzers depending on stop word removal if (this.stopWords) { analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(), length, length, " ", false, false, ""); // This is a // hack to use // Lucene 2.4 // since in 2.4 // position // increments // weren't // preserved by // default. // Using a later // version puts // underscores // (_) in the // place of // removed stop // words. 
} else { analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(), length, length, " ", false, false, ""); } } // Code to process and extract the ngrams TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text)); // OffsetAttribute offsetAttribute = // tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); // int tokenCount = 0; tokenStream.reset(); while (tokenStream.incrementToken()) { // int startOffset = offsetAttribute.startOffset(); // int endOffset = offsetAttribute.endOffset(); String termToken = charTermAttribute.toString(); // The actual token // term nGrams.add(termToken); // Add all ngrams to the ngram LinkedList // If n-grams are not allowed to overlap, then increment to point of // no overlap if (!overlap) { for (int i = 0; i < length - 1; i++) { tokenStream.incrementToken(); } } } // Store unique nGrams and frequencies in hash tables for (String nGram : nGrams) { if (nGramFreqs.containsKey(nGram)) { nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1); } else { nGramFreqs.put(nGram, 1); uniqueNGrams.add(nGram); } } }
From source file:com.finderbots.miner.PhraseShingleAnalyzer.java
License:Apache License
/**
 * Tokenizes {@code contentText} with this instance's analyzer and returns the non-empty terms.
 *
 * @param contentText the text to tokenize
 * @return the terms with a length greater than zero, in the order the analyzer produced them
 * @throws RuntimeException wrapping any {@link IOException} raised by the token stream
 */
public List<String> getTermList(String contentText) {
    TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);

    // Rough capacity guess taken from the input length.
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        try {
            stream.reset();
            while (stream.incrementToken()) {
                if (termAtt.termLength() > 0) {
                    result.add(termAtt.term());
                }
            }
            stream.end();
        } finally {
            // Always release the stream, even when tokenization fails part-way.
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}
From source file:com.flaptor.indextank.query.IndexEngineParser.java
License:Apache License
public Iterator<AToken> parseDocumentField(String fieldName, String content) { final TokenStream tkstream = analyzer.tokenStream(fieldName, new StringReader(content)); final TermAttribute termAtt = tkstream.addAttribute(TermAttribute.class); final PositionIncrementAttribute posIncrAttribute = tkstream.addAttribute(PositionIncrementAttribute.class); final OffsetAttribute offsetAtt = tkstream.addAttribute(OffsetAttribute.class); return new AbstractIterator<AToken>() { int currentPosition = 0; @Override// www.j av a 2 s . com protected AToken computeNext() { try { if (!tkstream.incrementToken()) { tkstream.end(); tkstream.close(); return endOfData(); } } catch (IOException e) { //This should never happen, as the reader is a StringReader } //final org.apache.lucene.analysis.Token luceneTk = tkstream.getAttribute(org.apache.lucene.analysis.Token.class); currentPosition += posIncrAttribute.getPositionIncrement(); final int position = currentPosition; final int startOffset = offsetAtt.startOffset(); final int endOffset = offsetAtt.endOffset(); final String text = termAtt.term(); return new AToken() { @Override public String getText() { return text; //luceneTk.term(); } @Override public int getPosition() { return position; //luceneTk.getPositionIncrement(); } @Override public int getStartOffset() { return startOffset; } @Override public int getEndOffset() { return endOffset; } }; } }; }
From source file:com.fujitsu.ca.fic.caissepop.evaluation.TokenizeText.java
License:Apache License
@Override public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) { return null; }// w w w . ja v a2s .co m DataBag bagOfTokens = bagFactory.newDefaultBag(); TokenStream tokenStream = null; try { String lineOfText = input.get(0).toString(); StringReader textInput = new StringReader(lineOfText); tokenStream = analyzer.tokenStream(noField, textInput); CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { Tuple termText = tupleFactory.newTuple(termAttribute.toString()); bagOfTokens.add(termText); termAttribute.setEmpty(); } } finally { if (tokenStream != null) { tokenStream.close(); } } return bagOfTokens; }
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
private String analyzeQuery(String query, Analyzer analyzer) { if (analyzer != null && query != null && query.length() > 0) { TokenStream tokenStream = analyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME, new StringReader(query)); StringBuilder newQueryB = new StringBuilder(); try {/*from ww w . j ava2 s . c o m*/ tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class); // OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class); // TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class); newQueryB.append(term.toString()); newQueryB.append(' '); } tokenStream.end(); return newQueryB.toString().trim(); } catch (IOException e) { throw new RuntimeException("uncaught exception in synonym processing", e); } finally { try { tokenStream.close(); } catch (IOException e) { throw new RuntimeException("uncaught exception in synonym processing", e); } } } return query; }
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
/** * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query. * /* w w w .j av a 2s .c o m*/ * @param synonymAnalyzer * @param solrParams * @return */ private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) { String origQuery = getQueryStringFromParser(); int queryLen = origQuery.length(); // TODO: make the token stream reusable? TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME, new StringReader(origQuery)); SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create(); boolean constructPhraseQueries = solrParams.getBool(Params.SYNONYMS_CONSTRUCT_PHRASES, false); boolean bag = solrParams.getBool(Params.SYNONYMS_BAG, false); List<String> synonymBag = new ArrayList<>(); try { tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class); TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class); if (!typeAttribute.type().equals("shingle")) { // ignore shingles; we only care about synonyms and the original text // TODO: filter other types as well String termToAdd = term.toString(); if (typeAttribute.type().equals("SYNONYM")) { synonymBag.add(termToAdd); } // Don't quote sibgle term term synonyms if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM") && termToAdd.contains(" ")) { // Don't Quote when original is already surrounded by quotes if (offsetAttribute.startOffset() == 0 || offsetAttribute.endOffset() == queryLen || origQuery.charAt(offsetAttribute.startOffset() - 1) != '"' || origQuery.charAt(offsetAttribute.endOffset()) != '"') { // make a phrase out of the synonym termToAdd = new StringBuilder(termToAdd).insert(0, '"').append('"').toString(); } } if (!bag) { // create a graph of all possible synonym combinations, // e.g. 
dog bite, hound bite, dog nibble, hound nibble, etc. TextInQuery textInQuery = new TextInQuery(termToAdd, offsetAttribute.startOffset(), offsetAttribute.endOffset()); startPosToTextsInQuery.put(offsetAttribute.startOffset(), textInQuery); } } } tokenStream.end(); } catch (IOException e) { throw new RuntimeException("uncaught exception in synonym processing", e); } finally { try { tokenStream.close(); } catch (IOException e) { throw new RuntimeException("uncaught exception in synonym processing", e); } } List<String> alternateQueries = synonymBag; if (!bag) { // use a graph rather than a bag List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<>(startPosToTextsInQuery.values().size()); sortedTextsInQuery.addAll(startPosToTextsInQuery.asMap().values().stream().map(ArrayList::new) .collect(Collectors.toList())); // have to use the start positions and end positions to figure out all possible combinations alternateQueries = buildUpAlternateQueries(solrParams, sortedTextsInQuery); } // save for debugging purposes expandedSynonyms = alternateQueries; return createSynonymQueries(solrParams, alternateQueries); }
From source file:com.github.ippeiukai.externaltoken.lucene.analysis.TestPatternTokenizer.java
License:Apache License
/** * TODO: rewrite tests not to use string comparison. *//* w w w . j av a 2s .c o m*/ private static String tsToString(TokenStream in) throws IOException { StringBuilder out = new StringBuilder(); CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); // extra safety to enforce, that the state is not preserved and also // assign bogus values in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); while (in.incrementToken()) { out.append(termAtt.toString()); in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); out.append(' '); } if (out.length() > 0) out.deleteCharAt(out.length() - 1); in.close(); return out.toString(); }
From source file:com.github.le11.nls.lucene.UIMABaseAnalyzerTest.java
License:Apache License
/**
 * Consumes the analyzer's token stream for a fixed sentence, printing each term with its
 * offsets, and fails if any exception is raised along the way.
 */
@Test
public void baseUIMAAnalyzerStreamTest() {
    try {
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            // The attribute references never change, so checking them once is enough.
            assertNotNull(offsetAtt);
            assertNotNull(termAtt);

            ts.reset();
            while (ts.incrementToken()) {
                System.out.println("token '" + termAtt.toString() + "' has offset " + offsetAtt.startOffset() + ","
                        + offsetAtt.endOffset());
            }
            ts.end();
        } finally {
            // Always release the stream, even when tokenization fails part-way.
            ts.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}