List of usage examples for org.apache.lucene.analysis TokenStream close
@Override public void close() throws IOException
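close() releases the resources held by the stream and is the last step of the TokenStream consumer lifecycle (reset, incrementToken, end, close); every example below follows that pattern before asking the Analyzer for another stream. As a minimal sketch of the lifecycle, assuming a recent Lucene release where StandardAnalyzer takes no constructor arguments (the field name "body" and the sample text are placeholders, not part of any example on this page):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", "Lucene token stream lifecycle");
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                   // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                     // signal end of stream (final offset state)
        } finally {
            ts.close();                   // release resources so the Analyzer can reuse its components
        }
        analyzer.close();
    }
}

Because the Analyzer reuses its tokenizer components per thread, the safest idiom is to guarantee close() in a finally block or a try-with-resources statement, as several of the examples below do.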
From source file:com.github.ippeiukai.externaltoken.lucene.analysis.TestPatternTokenizer.java
License:Apache License
/**
 * TODO: rewrite tests not to use string comparison.
 */
private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    // extra safety to enforce, that the state is not preserved and also
    // assign bogus values
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    while (in.incrementToken()) {
        out.append(termAtt.toString());
        in.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        out.append(' ');
    }
    if (out.length() > 0)
        out.deleteCharAt(out.length() - 1);
    in.close();
    return out.toString();
}
From source file:com.github.mosuka.apache.lucene.example.utils.LuceneExampleUtilTest.java
License:Apache License
public void testCreateAnalyzerWrapper() throws IOException {
    PerFieldAnalyzerWrapper wrapper = LuceneExampleUtil.createAnalyzerWrapper();
    TokenStream tokenStream = null;
    CharTermAttribute charTermAttribute = null;

    List<String> expectedIdTermList = new LinkedList<String>(Arrays.asList("1"));
    List<String> actualIdTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("id", "1");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualIdTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedIdTermList, actualIdTermList);

    List<String> expectedTextTermList = new LinkedList<String>(
            Arrays.asList("lucene", "is", "a", "full", "text", "search", "library"));
    List<String> actualTextTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("text", "Lucene is a Full-text search library.");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualTextTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedTextTermList, actualTextTermList);
}
From source file:com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java
License:Apache License
public List<String> tokenize(String text) {
    List<String> words = new ArrayList<String>();
    if (text != null && !text.isEmpty()) {
        TokenStream tokenStream = this.createTokenStream(text);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                words.add(term);
            }
        } catch (IOException ioe) {
            LOGGER.error("Unable to analyze text. Cause : " + ioe.getMessage(), ioe);
        } finally {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (IOException e) {
                // Can't do nothing!!
                LOGGER.error("Unable to close token stream : " + e.getMessage());
            }
        }
    }
    return words;
}
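Because TokenStream implements Closeable, the manual end()/close() in the finally block above can also be written with try-with-resources on Java 7+. A sketch of that alternative, reusing the same assumed createTokenStream() helper and LOGGER from the example, and adding the reset() call that newer Lucene versions require before incrementToken():

public List<String> tokenize(String text) {
    List<String> words = new ArrayList<String>();
    if (text == null || text.isEmpty()) {
        return words;
    }
    // try-with-resources closes the stream even if incrementToken() throws
    try (TokenStream tokenStream = this.createTokenStream(text)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } catch (IOException ioe) {
        LOGGER.error("Unable to analyze text. Cause : " + ioe.getMessage(), ioe);
    }
    return words;
}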
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private List<LuceneToken> readTokens(TokenStream tokenStream) throws IOException {
    ArrayList<LuceneToken> tokens = new ArrayList<LuceneToken>();
    HashMap<Integer, LuceneToken> tokensByStartOffset = new HashMap<Integer, LuceneToken>();
    addAttributes(tokenStream);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (tokenStream.hasAttributes()) {
            LuceneToken token = new LuceneToken();
            readOffset(tokenStream, token);

            // Lucene may output multiple tokens for compound words
            LuceneToken tokenWithSameStartOffset = tokensByStartOffset.get(token.getStartOffset());
            if (tokenWithSameStartOffset != null) {
                if (token.getEndOffset() >= tokenWithSameStartOffset.getEndOffset()) {
                    continue;
                } else {
                    tokens.remove(tokenWithSameStartOffset);
                }
            }
            readReading(tokenStream, token);
            readPartOfSpeech(tokenStream, token);
            readInflection(tokenStream, token);
            readBaseForm(tokenStream, token);
            tokensByStartOffset.put(token.getStartOffset(), token);
            tokens.add(token);
        }
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}
From source file:com.github.tteofili.looseen.MinHashClassifier.java
License:Apache License
private ArrayList<String> getTokens(Analyzer analyzer, String field, String value) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(field, value);
    ts.reset();
    while (ts.incrementToken()) {
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}
From source file:com.globalsight.ling.lucene.highlight.Highlighter.java
License:Apache License
/**
 * Low level api to get the most relevant (formatted) sections of
 * the document.
 *
 * This method has been made public to allow visibility of score
 * information held in TextFragment objects. Thanks to Jason
 * Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param maxNumFragments
 * @param mergeContiguousFragments
 * @return
 * @throws IOException
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments) throws IOException {
    ArrayList docFrags = new ArrayList();
    StringBuffer newText = new StringBuffer();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {
        org.apache.lucene.analysis.Token token;
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text);

        TokenGroup tokenGroup = new TokenGroup();

        while ((token = LuceneUtil.getNextToken(tokenStream)) != null) {
            if (tokenGroup.numTokens > 0 && tokenGroup.isDistinct(token)) {
                // the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.startOffset;
                endOffset = tokenGroup.endOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(tokenText, tokenGroup);
                // store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(text.substring(lastEndOffset, startOffset));
                newText.append(markedUpText);
                lastEndOffset = endOffset;
                tokenGroup.clear();

                // check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment(token)) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    // record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(token, fragmentScorer.getTokenScore(token));

            if (lastEndOffset > maxDocBytesToAnalyze) {
                break;
            }
        }

        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            // flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.startOffset;
            endOffset = tokenGroup.endOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(tokenText, tokenGroup);
            // store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset) {
                newText.append(text.substring(lastEndOffset, startOffset));
            }
            newText.append(markedUpText);
            lastEndOffset = endOffset;
        }

        // append text after end of last token
        if (lastEndOffset < text.length()) {
            newText.append(text.substring(lastEndOffset));
        }

        currentFrag.textEndPos = newText.length();

        // sort the most relevant sections of the text
        for (int i = 0, max = docFrags.size(); i < max; i++) {
            currentFrag = (TextFragment) docFrags.get(i);
            fragQueue.insertWithOverflow(currentFrag);
        }

        // return the most relevant fragments
        TextFragment result[] = new TextFragment[fragQueue.size()];
        for (int i = result.length - 1; i >= 0; i--) {
            result[i] = (TextFragment) fragQueue.pop();
        }

        // merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(result);
            ArrayList fragTexts = new ArrayList();
            for (int i = 0; i < result.length; i++) {
                if (result[i] != null && result[i].getScore() > 0) {
                    fragTexts.add(result[i]);
                }
            }
            result = (TextFragment[]) fragTexts.toArray(new TextFragment[0]);
        }
        return result;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/**
 * Create GlobalSight TM tokens from a provided segment string
 * using GsAnalyzer.
 *
 * @param p_text fuzzy match format string
 * @return List of c.g.l.tm2.index.Tokens
 */
public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    //GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class);
    //org.apache.lucene.analysis.Token luceneToken = null;
    List<String> tokens = new ArrayList<String>();

    while (tokenStream.incrementToken()) {
        // luceneToken = gsAtt.getToken();
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }

    tokenStream.close();
    return buildTokenList(tokens);
}
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/**
 * Create GlobalSight TM tokens from a provided segment string
 * using GsAnalyzer. This method is suitable for use with TM3
 * fuzzy indices, and does two things differently than createGsTokens():
 * 1) It returns tokens in the order in which they appear
 * 2) It does not collapse duplicate tokens (and correspondingly does
 *    not return count information)
 *
 * @param p_text fuzzy match format string
 * @return List of Strings, each representing one token
 */
public static List<String> createTm3Tokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();

    return tokens;
}
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
@SuppressWarnings("resource") public static List<String> createTm3TokensNoStopWord(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale, false); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset();/*from www . j a va 2 s .c o m*/ List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString()); } tokenStream.close(); return tokens; }
From source file:com.jaeksoft.searchlib.request.SearchField.java
License:Open Source License
final private List<TermQueryItem> getTermQueryFilter(final PerFieldAnalyzer perFieldAnalyzer,
        CompiledAnalyzer compiledAnalyzer, final String queryString) throws IOException {
    TokenStream ts = null;
    TokenQueryFilter.TermQueryFilter tqf = null;
    Analyzer analyzer = compiledAnalyzer != null ? compiledAnalyzer : perFieldAnalyzer.getKeywordAnalyzer();
    try {
        ts = analyzer.tokenStream(field, new StringReader(queryString));
        tqf = new TermQueryFilter(compiledAnalyzer, field, (float) termBoost, ts);
        while (tqf.incrementToken())
            ;
        ts.end();
        ts.close();
        tqf.sortByOffset();
        TermQueryFilter.includeChildrenBrothers(tqf.termQueryItems);
        for (TermQueryItem termQueryItem : tqf.termQueryItems)
            termQueryItem.includeChildrenBrothers();
        return tqf.termQueryItems;
    } finally {
        IOUtils.close(tqf, ts, analyzer);
    }
}