Usage examples for org.apache.lucene.analysis.TokenStream#close(), collected from open-source projects
@Override public void close() throws IOException
From source file:org.exist.indexing.range.RangeIndexWorker.java
License:Open Source License
protected BytesRef analyzeContent(String field, QName qname, String data, DocumentSet docs) throws XPathException { final Analyzer analyzer = getAnalyzer(qname, field, docs); if (!isCaseSensitive(qname, field, docs)) { data = data.toLowerCase();//from w w w . j a va2 s . c o m } if (analyzer == null) { return new BytesRef(data); } try { TokenStream stream = analyzer.tokenStream(field, new StringReader(data)); TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class); BytesRef token = null; try { stream.reset(); if (stream.incrementToken()) { termAttr.fillBytesRef(); token = termAttr.getBytesRef(); } stream.end(); } finally { stream.close(); } return token; } catch (IOException e) { throw new XPathException("Error analyzing the query string: " + e.getMessage(), e); } }
From source file:org.exoplatform.services.jcr.impl.core.query.lucene.AbstractExcerpt.java
License:Apache License
/** * @param text the text./* w w w.j a va2 s. co m*/ * @return a <code>TermPositionVector</code> for the given text. */ private TermPositionVector createTermPositionVector(String text) { // term -> TermVectorOffsetInfo[] final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>(); Reader r = new StringReader(text); TokenStream ts = index.getTextAnalyzer().tokenStream("", r); try { while (ts.incrementToken()) { OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); String termText = new String(term.buffer(), 0, term.length()); TermVectorOffsetInfo[] info = termMap.get(termText); if (info == null) { info = new TermVectorOffsetInfo[1]; } else { TermVectorOffsetInfo[] tmp = info; info = new TermVectorOffsetInfo[tmp.length + 1]; System.arraycopy(tmp, 0, info, 0, tmp.length); } info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset()); termMap.put(termText, info); } ts.end(); ts.close(); } catch (IOException e) { // should never happen, we are reading from a string if (LOG.isTraceEnabled()) { LOG.trace("An exception occurred: " + e.getMessage()); } } return new TermPositionVector() { private String[] terms = termMap.keySet().toArray(new String[termMap.size()]); public int[] getTermPositions(int index) { return null; } public TermVectorOffsetInfo[] getOffsets(int index) { TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO; if (index >= 0 && index < terms.length) { info = termMap.get(terms[index]); } return info; } public String getField() { return ""; } public int size() { return terms.length; } public String[] getTerms() { return terms; } public int[] getTermFrequencies() { int[] freqs = new int[terms.length]; for (int i = 0; i < terms.length; i++) { freqs[i] = termMap.get(terms[i]).length; } return freqs; } public int indexOf(String term) { int res = Arrays.binarySearch(terms, term); return res >= 0 
? res : -1; } public int[] indexesOf(String[] terms, int start, int len) { int[] res = new int[len]; for (int i = 0; i < len; i++) { res[i] = indexOf(terms[i]); } return res; } }; }
From source file:org.exoplatform.services.jcr.impl.core.query.lucene.MoreLikeThis.java
License:Apache License
/** * Adds term frequencies found by tokenizing text from reader into the Map words * @param r a source of text to be tokenized * @param termFreqMap a Map of terms and their frequencies * @param fieldName Used by analyzer for any special per-field analysis *//*w w w . ja v a2 s. c o m*/ private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException { TokenStream ts = analyzer.tokenStream(fieldName, r); int tokenCount = 0; // for every token while (ts.incrementToken()) { CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); String word = new String(term.buffer(), 0, term.length()); tokenCount++; if (tokenCount > maxNumTokensParsed) { break; } if (isNoiseWord(word)) { continue; } // increment frequency Int cnt = termFreqMap.get(word); if (cnt == null) { termFreqMap.put(word, new Int()); } else { cnt.x++; } } ts.end(); ts.close(); }
From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java
License:Apache License
@Test public void testAbbreviatedNames() throws Exception { String text = "A survey of the Abies conifers found in Europe. A. is not a abbreviated genus. A. alba, A. betula, A.picea and Picea picea is something else."; Reader input = new StringReader(text); LinkedList<String> expected = new LinkedList<String>(); expected.add("Abies"); expected.add("Abies alba"); expected.add("Abies betula"); expected.add("Abies picea"); expected.add("Picea picea"); TokenStream tokens = getTokens(input); SciNameIterator iter = new SciNameIterator(tokens); for (SciName sn : iter) { // System.out.println(sn); assertEquals(expected.poll(), sn.scientificName); }// w w w. ja va 2 s.c o m tokens.end(); tokens.close(); }
From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java
License:Apache License
/**
 * Test Biebersteiniaceae eFlora example html that proved to have problems with names found across html tags
 * Source: http://www.efloras.org/florataxon.aspx?flora_id=2&taxon_id=20048
 */
@Test
public void testBiebersteiniaceae() throws Exception {
    Reader input = new InputStreamReader(isu.classpathStream("sources/biebersteiniaceae/document.txt"), "UTF-8");
    TokenStream tokens = getTokens(input);
    SciNameIterator iter = new SciNameIterator(tokens);
    int count = 0;
    for (SciName sn : iter) {
        System.out.println(sn);
        count++;
    }
    System.out.println("Biebersteiniaceae names found: " + count);
    // FIX: assertEquals reports expected vs. actual on failure,
    // unlike the previous assertTrue(count == 14).
    assertEquals(14, count);
    tokens.end();
    tokens.close();
}
From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java
License:Apache License
/** * Test bioline html file taken from http://www.bioline.org.br/abstract?id=fb95003 *///from w w w .j ava 2s . c o m @Test public void testHtml() throws Exception { Reader input = new InputStreamReader(isu.classpathStream("sources/bioline/document.txt"), "UTF-8"); // input = new InputStreamReader(new FileInputStream(new File("/Users/markus/Desktop/bioline-fb95003.html")), "UTF-8"); TokenStream tokens = getTokens(input); SciNameIterator iter = new SciNameIterator(tokens); int count = 0; int countMugil = 0; for (SciName sn : iter) { System.out.println(sn); count++; if (sn.scientificName.startsWith("Mugil ")) { countMugil++; } } System.out.println("BIOLINE names found: " + count); assertTrue(count == 49); assertTrue(countMugil == 12); tokens.end(); tokens.close(); }
From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java
License:Apache License
@Test public void testSimpleText() throws Exception { System.out.println(StringUtils.isAllUpperCase("G")); System.out.println(StringUtils.isAllUpperCase("G")); String text = "Help, Asteraceae or is (Felinia) or Felis (Felinia) foordi found. I can't see any of these famous Abies alba anywhere around here, can you? Maybe this is Potentilla vulgaris L. ? You can't be sure, my dear. Paris is a pretty town too, isn't it? They have big numbers of Passer domesticus subsp. domesticus, the most frequent subspecies of Passer domesticus (Linnaeus, 1758)"; Reader input = new StringReader(text); LinkedList<String> expected = new LinkedList<String>(); expected.add("Asteraceae"); expected.add("Felis (Felinia) foordi"); expected.add("Abies alba"); expected.add("Potentilla vulgaris"); expected.add("Passer domesticus subsp. domesticus"); expected.add("Passer domesticus"); TokenStream tokens = getTokens(input); SciNameIterator iter = new SciNameIterator(tokens); for (SciName sn : iter) { // System.out.println(sn); assertEquals(expected.poll(), sn.scientificName); }/*from w w w .jav a 2 s.co m*/ tokens.end(); tokens.close(); }
From source file:org.genemania.completion.lucene.GeneCompletionProvider.java
License:Open Source License
public Long getNodeId(String symbol) { try {// ww w .ja v a 2 s.co m TokenStream tokens = analyze(symbol); PhraseQuery query = new PhraseQuery(); tokens.reset(); while (tokens.incrementToken()) { TermAttribute term = tokens.getAttribute(TermAttribute.class); query.add(new Term(GeneIndexBuilder.GENE_FIELD, term.term())); } tokens.end(); tokens.close(); final Set<Long> nodes = new HashSet<Long>(); searcher.search(query, new AbstractCollector() { @Override public void handleHit(int id) { try { Document document = searcher.doc(id); nodes.add(Long.parseLong(document.get(GeneIndexBuilder.NODE_ID_FIELD))); } catch (IOException e) { log(e); } } }); if (nodes.size() > 0) { return nodes.iterator().next(); } } catch (IOException e) { log(e); } return null; }
From source file:org.genemania.data.classification.lucene.LuceneGeneClassifier.java
License:Open Source License
public void classify(final String symbol, final IGeneClassificationHandler handler) throws ApplicationException { try {//from w w w . j ava 2 s . c o m TokenStream tokens = analyze(symbol); PhraseQuery query = new PhraseQuery(); tokens.reset(); while (tokens.incrementToken()) { TermAttribute term = tokens.getAttribute(TermAttribute.class); query.add(new Term(LuceneMediator.GENE_SYMBOL, term.term())); } tokens.end(); tokens.close(); searcher.search(query, new AbstractCollector() { @Override public void handleHit(int doc) { try { Document document = searcher.doc(doc); long organismId = Long.parseLong(document.get(LuceneMediator.GENE_ORGANISM_ID)); handler.handleClassification(symbol, organismId); } catch (IOException e) { log(e); } } }); } catch (IOException e) { throw new ApplicationException(e); } }
From source file:org.genemania.mediator.lucene.LuceneMediator.java
License:Open Source License
protected PhraseQuery createPhraseQuery(String field, String phrase) throws IOException { TokenStream stream = analyze(phrase); stream.reset();//from ww w . j ava2 s . c o m PhraseQuery query = new PhraseQuery(); while (stream.incrementToken()) { TermAttribute term = stream.getAttribute(TermAttribute.class); query.add(new Term(field, term.term())); } stream.end(); stream.close(); return query; }