List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
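Before the per-project examples below, here is a minimal sketch of the consume loop that all of them follow: reset() must be called before the first incrementToken(), and end()/close() after the loop. The class name, the field name "body", the sample text, and the choice of StandardAnalyzer are illustrative assumptions, not taken from any of the source files.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical sketch of the reset() -> incrementToken() -> end()/close() lifecycle.
public class TokenStreamResetSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // Field name "body" and the sample text are illustrative only.
        try (TokenStream tokenStream = analyzer.tokenStream("body",
                new StringReader("Lucene token stream example"))) {
            CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();                        // mandatory before the first incrementToken()
            while (tokenStream.incrementToken()) {
                System.out.println(termAtt.toString()); // print each term
            }
            tokenStream.end();                          // record end-of-stream state
        }                                               // try-with-resources calls close()
    }
}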
From source file:com.faqit.similarity.NGramExtractor.java
License:Open Source License
/**
 * Extracts NGrams from a String of text. Can handle ngrams of any length
 * and also perform stop word removal before extraction.
 *
 * @param text      the text that the ngrams should be extracted from
 * @param length    the length of the ngrams
 * @param stopWords whether or not stopwords should be removed before extraction
 * @param overlap   whether or not the ngrams should overlap
 */
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {
    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /*
     * If the length is 1, then we want unigrams. Make use of a StopAnalyzer
     * when stop words should be removed and a SimpleAnalyzer when stop words
     * should be included.
     */
    if (length == 1) {
        if (this.stopWords) {
            analyzer = new StandardAnalyzer();
        } else {
            analyzer = new SimpleAnalyzer();
        }
    } else {
        // Bigger than unigrams, so use ShingleAnalyzerWrapper. Once again,
        // different analyzers depending on stop word removal.
        if (this.stopWords) {
            // This is a hack to use Lucene 2.4, since in 2.4 position increments
            // weren't preserved by default. Using a later version puts underscores
            // (_) in the place of removed stop words.
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(), length, length, " ", false, false, "");
        } else {
            analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(), length, length, " ", false, false, "");
        }
    }

    // Code to process and extract the ngrams
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    // OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    // int tokenCount = 0;

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        // int startOffset = offsetAttribute.startOffset();
        // int endOffset = offsetAttribute.endOffset();
        String termToken = charTermAttribute.toString(); // the actual token term
        nGrams.add(termToken); // add all ngrams to the ngram LinkedList

        // If n-grams are not allowed to overlap, then increment to the point of no overlap
        if (!overlap) {
            for (int i = 0; i < length - 1; i++) {
                tokenStream.incrementToken();
            }
        }
    }

    // Store unique nGrams and frequencies in hash tables
    for (String nGram : nGrams) {
        if (nGramFreqs.containsKey(nGram)) {
            nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }
}
From source file:com.fujitsu.ca.fic.caissepop.evaluation.TokenizeText.java
License:Apache License
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) {
        return null;
    }

    DataBag bagOfTokens = bagFactory.newDefaultBag();
    TokenStream tokenStream = null;
    try {
        String lineOfText = input.get(0).toString();
        StringReader textInput = new StringReader(lineOfText);
        tokenStream = analyzer.tokenStream(noField, textInput);
        CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            Tuple termText = tupleFactory.newTuple(termAttribute.toString());
            bagOfTokens.add(termText);
            termAttribute.setEmpty();
        }
    } finally {
        if (tokenStream != null) {
            tokenStream.close();
        }
    }
    return bagOfTokens;
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
private String analyzeQuery(String query, Analyzer analyzer) {
    if (analyzer != null && query != null && query.length() > 0) {
        TokenStream tokenStream = analyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME, new StringReader(query));
        StringBuilder newQueryB = new StringBuilder();
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
                // OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
                // TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);
                newQueryB.append(term.toString());
                newQueryB.append(' ');
            }
            tokenStream.end();
            return newQueryB.toString().trim();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        } finally {
            try {
                tokenStream.close();
            } catch (IOException e) {
                throw new RuntimeException("uncaught exception in synonym processing", e);
            }
        }
    }
    return query;
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 *
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {
    String origQuery = getQueryStringFromParser();
    int queryLen = origQuery.length();

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(origQuery));

    SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create();

    boolean constructPhraseQueries = solrParams.getBool(Params.SYNONYMS_CONSTRUCT_PHRASES, false);
    boolean bag = solrParams.getBool(Params.SYNONYMS_BAG, false);
    List<String> synonymBag = new ArrayList<>();

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                String termToAdd = term.toString();

                if (typeAttribute.type().equals("SYNONYM")) {
                    synonymBag.add(termToAdd);
                }

                // Don't quote single-term synonyms
                if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")
                        && termToAdd.contains(" ")) {

                    // Don't quote when the original is already surrounded by quotes
                    if (offsetAttribute.startOffset() == 0 || offsetAttribute.endOffset() == queryLen
                            || origQuery.charAt(offsetAttribute.startOffset() - 1) != '"'
                            || origQuery.charAt(offsetAttribute.endOffset()) != '"') {
                        // make a phrase out of the synonym
                        termToAdd = new StringBuilder(termToAdd).insert(0, '"').append('"').toString();
                    }
                }
                if (!bag) {
                    // create a graph of all possible synonym combinations,
                    // e.g. dog bite, hound bite, dog nibble, hound nibble, etc.
                    TextInQuery textInQuery = new TextInQuery(termToAdd, offsetAttribute.startOffset(),
                            offsetAttribute.endOffset());
                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), textInQuery);
                }
            }
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        }
    }

    List<String> alternateQueries = synonymBag;

    if (!bag) {
        // use a graph rather than a bag
        List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<>(startPosToTextsInQuery.values().size());
        sortedTextsInQuery.addAll(startPosToTextsInQuery.asMap().values().stream().map(ArrayList::new)
                .collect(Collectors.toList()));

        // have to use the start positions and end positions to figure out all possible combinations
        alternateQueries = buildUpAlternateQueries(solrParams, sortedTextsInQuery);
    }

    // save for debugging purposes
    expandedSynonyms = alternateQueries;

    return createSynonymQueries(solrParams, alternateQueries);
}
From source file:com.github.mosuka.apache.lucene.example.utils.LuceneExampleUtilTest.java
License:Apache License
public void testCreateAnalyzerWrapper() throws IOException {
    PerFieldAnalyzerWrapper wrapper = LuceneExampleUtil.createAnalyzerWrapper();

    TokenStream tokenStream = null;
    CharTermAttribute charTermAttribute = null;

    List<String> expectedIdTermList = new LinkedList<String>(Arrays.asList("1"));
    List<String> actualIdTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("id", "1");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualIdTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedIdTermList, actualIdTermList);

    List<String> expectedTextTermList = new LinkedList<String>(
            Arrays.asList("lucene", "is", "a", "full", "text", "search", "library"));
    List<String> actualTextTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("text", "Lucene is a Full-text search library.");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualTextTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedTextTermList, actualTextTermList);
}
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private List<LuceneToken> readTokens(TokenStream tokenStream) throws IOException {
    ArrayList<LuceneToken> tokens = new ArrayList<LuceneToken>();
    HashMap<Integer, LuceneToken> tokensByStartOffset = new HashMap<Integer, LuceneToken>();

    addAttributes(tokenStream);
    tokenStream.reset();

    while (tokenStream.incrementToken()) {
        if (tokenStream.hasAttributes()) {
            LuceneToken token = new LuceneToken();
            readOffset(tokenStream, token);

            // Lucene may output multiple tokens for compound words
            LuceneToken tokenWithSameStartOffset = tokensByStartOffset.get(token.getStartOffset());
            if (tokenWithSameStartOffset != null) {
                if (token.getEndOffset() >= tokenWithSameStartOffset.getEndOffset()) {
                    continue;
                } else {
                    tokens.remove(tokenWithSameStartOffset);
                }
            }

            readReading(tokenStream, token);
            readPartOfSpeech(tokenStream, token);
            readInflection(tokenStream, token);
            readBaseForm(tokenStream, token);

            tokensByStartOffset.put(token.getStartOffset(), token);
            tokens.add(token);
        }
    }

    tokenStream.end();
    tokenStream.close();
    return tokens;
}
From source file:com.github.rnewson.couchdb.lucene.util.AnalyzersTest.java
License:Apache License
private String[] analyze(final String analyzerName, final String text) throws Exception {
    final Analyzer analyzer = Analyzers.getAnalyzer(analyzerName);
    final TokenStream stream = analyzer.tokenStream("default", new StringReader(text));
    stream.reset();
    final List<String> result = new ArrayList<String>();
    while (stream.incrementToken()) {
        final CharTermAttribute c = stream.getAttribute(CharTermAttribute.class);
        result.add(c.toString());
    }
    return result.toArray(new String[0]);
}
From source file:com.github.tteofili.looseen.MinHashClassifier.java
License:Apache License
private ArrayList<String> getTokens(Analyzer analyzer, String field, String value) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(field, value);
    ts.reset();
    while (ts.incrementToken()) {
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}
From source file:com.globalsight.ling.lucene.Index.java
License:Apache License
/**
 * Executes a search in the index returning no more than p_maxHits
 * (suggested: 5-10), and having no score smaller than p_minScore.
 *
 * This implementation is based on Lucene, and Lucene score values vary
 * widely, making it hard to specify a useful cut-off like 0.7 or 0.5.
 * Good scores can be < 0.2. All that is guaranteed is that scores are
 * numerically ordered. Use p_maxHits instead.
 */
public Hits search(String p_text, int end, int begin, float p_minScore)
        throws IOException, InterruptedException {
    synchronized (m_state) {
        if (m_state != STATE_OPENED) {
            throw new IOException("index is not available");
        }
    }

    try {
        m_lock.readLock().acquire();

        try {
            // Search the current index.
            //IndexReader reader = DirectoryReader.open(m_fsDir);
            //IndexSearcher searcher = new IndexSearcher(reader);
            IndexSearcher searcher = LuceneCache.getLuceneCache(m_directory).getIndexSearcher();
            Query query = getQuery(p_text);
            int maxHits = end - begin;
            TopDocs topDocs = searcher.search(query, maxHits);
            if (topDocs.totalHits > 0) {
                noResult = false;
            }

            // Store results in our own object.
            Hits result = new Hits(searcher, topDocs.scoreDocs, end, begin, p_minScore, p_text);

            // Highlight query terms in long results.
            if (m_type == TYPE_TEXT) {
                // Note: rewrite MultiTermQuery, RangeQuery or PrefixQuery.
                // TODO: optimize object creation if it all works.
                Highlighter highlighter = new Highlighter(new SimpleFormatter(), new QueryScorer(query));

                int max = Math.min(end, topDocs.totalHits);
                for (int i = begin; i < max; i++) {
                    Document doc = searcher.doc(topDocs.scoreDocs[i].doc);
                    String text = doc.get(IndexDocument.TEXT);

                    TokenStream tokenStream = m_analyzer.tokenStream(IndexDocument.TEXT,
                            new StringReader(text));
                    tokenStream.reset();

                    // Get 3 best fragments and separate with "..."
                    String hilite = highlighter.getBestFragments(tokenStream, text, 3, "...");
                    result.getHit(i).setText(hilite);
                }
            }

            //searcher.close();
            //reader.close();

            return result;
        } finally {
            m_lock.readLock().release();
        }
    } catch (InterruptedException ex) {
        throw new IOException(ex.getMessage());
    }
}
From source file:com.globalsight.ling.lucene.TbFuzzyIndex.java
License:Apache License
protected Query getQuery(String p_text) throws IOException {
    BooleanQuery result = new BooleanQuery();

    if (AnalyzerFactory.TOKENIZE_3GRAM == m_tokenize) {
        m_analyzer = AnalyzerFactory.getInstance(getLocale(), m_tokenize);
    }

    TokenStream tokens = m_analyzer.tokenStream(IndexDocument.TEXT, new StringReader(p_text));
    tokens.reset();

    Token t;
    while ((t = LuceneUtil.getNextToken(tokens)) != null) {
        result.add(new BooleanClause(new TermQuery(new Term(IndexDocument.TEXT, t.toString())),
                Occur.SHOULD));
    }

    return result;
}