List of usage examples for org.apache.lucene.analysis.TokenStream.end()
public void end() throws IOException
This method is called by the consumer after the last token has been consumed, after incrementToken() returned false (using the new TokenStream API).
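All of the examples below follow the same consumption sequence: reset(), then incrementToken() until it returns false, then end(), then close(). The following minimal sketch condenses that pattern; the helper name collectTokens and its parameters are hypothetical and not taken from any of the source files listed here. It relies only on org.apache.lucene.analysis.Analyzer, TokenStream and org.apache.lucene.analysis.tokenattributes.CharTermAttribute.

// Hypothetical helper: drains a TokenStream and returns the terms it produced.
static List<String> collectTokens(Analyzer analyzer, String field, String text) throws IOException {
    List<String> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();                   // must be called before the first incrementToken()
        while (stream.incrementToken()) { // advance token by token
            tokens.add(termAttr.toString());
        }
        stream.end();                     // called once incrementToken() has returned false
    } finally {
        stream.close();                   // always release the stream
    }
    return tokens;
}

Each example below embeds this same sequence in project-specific query or indexing code.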
From source file:org.exist.indexing.lucene.XMLToQuery.java
License:Open Source License
private Query phraseQuery(String field, Element node, Analyzer analyzer) throws XPathException {
    NodeList termList = node.getElementsByTagName("term");
    if (termList.getLength() == 0) {
        PhraseQuery query = new PhraseQuery();
        String qstr = getText(node);
        try {
            TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr));
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                query.add(new Term(field, termAttr.toString()));
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            throw new XPathException("Error while parsing phrase query: " + qstr);
        }
        int slop = getSlop(node);
        if (slop > -1)
            query.setSlop(slop);
        return query;
    }
    MultiPhraseQuery query = new MultiPhraseQuery();
    for (int i = 0; i < termList.getLength(); i++) {
        Element elem = (Element) termList.item(i);
        String text = getText(elem);
        if (text.indexOf('?') > -1 || text.indexOf('*') > 0) {
            Term[] expanded = expandTerms(field, text);
            if (expanded.length > 0)
                query.add(expanded);
        } else {
            String termStr = getTerm(field, text, analyzer);
            if (termStr != null)
                query.add(new Term(field, text));
        }
    }
    int slop = getSlop(node);
    if (slop > -1)
        query.setSlop(slop);
    return query;
}
From source file:org.exist.indexing.lucene.XMLToQuery.java
License:Open Source License
private SpanQuery nearQuery(String field, Element node, Analyzer analyzer) throws XPathException {
    int slop = getSlop(node);
    if (slop < 0)
        slop = 0;
    boolean inOrder = true;
    if (node.hasAttribute("ordered"))
        inOrder = node.getAttribute("ordered").equals("yes");
    if (!hasElementContent(node)) {
        String qstr = getText(node);
        List<SpanTermQuery> list = new ArrayList<>(8);
        try {
            TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr));
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                list.add(new SpanTermQuery(new Term(field, termAttr.toString())));
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            throw new XPathException("Error while parsing phrase query: " + qstr);
        }
        return new SpanNearQuery(list.toArray(new SpanTermQuery[list.size()]), slop, inOrder);
    }
    SpanQuery[] children = parseSpanChildren(field, node, analyzer);
    return new SpanNearQuery(children, slop, inOrder);
}
From source file:org.exist.indexing.lucene.XMLToQuery.java
License:Open Source License
private String getTerm(String field, String text, Analyzer analyzer) throws XPathException {
    String term = null;
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        if (stream.incrementToken()) {
            term = termAttr.toString();
        }
        stream.end();
        stream.close();
        return term;
    } catch (IOException e) {
        throw new XPathException("Lucene index error while creating query: " + e.getMessage(), e);
    }
}
From source file:org.exist.indexing.range.RangeIndexWorker.java
License:Open Source License
protected BytesRef analyzeContent(String field, QName qname, String data, DocumentSet docs) throws XPathException {
    final Analyzer analyzer = getAnalyzer(qname, field, docs);
    if (!isCaseSensitive(qname, field, docs)) {
        data = data.toLowerCase();
    }
    if (analyzer == null) {
        return new BytesRef(data);
    }
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(data));
        TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
        BytesRef token = null;
        try {
            stream.reset();
            if (stream.incrementToken()) {
                termAttr.fillBytesRef();
                token = termAttr.getBytesRef();
            }
            stream.end();
        } finally {
            stream.close();
        }
        return token;
    } catch (IOException e) {
        throw new XPathException("Error analyzing the query string: " + e.getMessage(), e);
    }
}
From source file:org.exoplatform.services.jcr.impl.core.query.lucene.AbstractExcerpt.java
License:Apache License
/**
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            String termText = new String(term.buffer(), 0, term.length());
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
        if (LOG.isTraceEnabled()) {
            LOG.trace("An exception occurred: " + e.getMessage());
        }
    }
    return new TermPositionVector() {
        private String[] terms = termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}
From source file:org.exoplatform.services.jcr.impl.core.query.lucene.MoreLikeThis.java
License:Apache License
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    int tokenCount = 0;
    // for every token
    while (ts.incrementToken()) {
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        String word = new String(term.buffer(), 0, term.length());
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }
        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}
From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java
License:Apache License
@Test
public void testAbbreviatedNames() throws Exception {
    String text = "A survey of the Abies conifers found in Europe. A. is not a abbreviated genus. A. alba, A. betula, A.picea and Picea picea is something else.";
    Reader input = new StringReader(text);
    LinkedList<String> expected = new LinkedList<String>();
    expected.add("Abies");
    expected.add("Abies alba");
    expected.add("Abies betula");
    expected.add("Abies picea");
    expected.add("Picea picea");
    TokenStream tokens = getTokens(input);
    SciNameIterator iter = new SciNameIterator(tokens);
    for (SciName sn : iter) {
        // System.out.println(sn);
        assertEquals(expected.poll(), sn.scientificName);
    }
    tokens.end();
    tokens.close();
}
From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java
License:Apache License
/**
 * Test Biebersteiniaceae eFlora example html that proved to have problems with names found across html tags
 * Source: http://www.efloras.org/florataxon.aspx?flora_id=2&taxon_id=20048
 */
@Test
public void testBiebersteiniaceae() throws Exception {
    Reader input = new InputStreamReader(isu.classpathStream("sources/biebersteiniaceae/document.txt"), "UTF-8");
    TokenStream tokens = getTokens(input);
    SciNameIterator iter = new SciNameIterator(tokens);
    int count = 0;
    for (SciName sn : iter) {
        System.out.println(sn);
        count++;
    }
    System.out.println("Biebersteiniaceae names found: " + count);
    assertTrue(count == 14);
    tokens.end();
    tokens.close();
}
From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java
License:Apache License
/**
 * Test bioline html file taken from http://www.bioline.org.br/abstract?id=fb95003
 */
@Test
public void testHtml() throws Exception {
    Reader input = new InputStreamReader(isu.classpathStream("sources/bioline/document.txt"), "UTF-8");
    // input = new InputStreamReader(new FileInputStream(new File("/Users/markus/Desktop/bioline-fb95003.html")), "UTF-8");
    TokenStream tokens = getTokens(input);
    SciNameIterator iter = new SciNameIterator(tokens);
    int count = 0;
    int countMugil = 0;
    for (SciName sn : iter) {
        System.out.println(sn);
        count++;
        if (sn.scientificName.startsWith("Mugil ")) {
            countMugil++;
        }
    }
    System.out.println("BIOLINE names found: " + count);
    assertTrue(count == 49);
    assertTrue(countMugil == 12);
    tokens.end();
    tokens.close();
}
From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java
License:Apache License
@Test
public void testSimpleText() throws Exception {
    System.out.println(StringUtils.isAllUpperCase("G"));
    System.out.println(StringUtils.isAllUpperCase("G"));
    String text = "Help, Asteraceae or is (Felinia) or Felis (Felinia) foordi found. I can't see any of these famous Abies alba anywhere around here, can you? Maybe this is Potentilla vulgaris L. ? You can't be sure, my dear. Paris is a pretty town too, isn't it? They have big numbers of Passer domesticus subsp. domesticus, the most frequent subspecies of Passer domesticus (Linnaeus, 1758)";
    Reader input = new StringReader(text);
    LinkedList<String> expected = new LinkedList<String>();
    expected.add("Asteraceae");
    expected.add("Felis (Felinia) foordi");
    expected.add("Abies alba");
    expected.add("Potentilla vulgaris");
    expected.add("Passer domesticus subsp. domesticus");
    expected.add("Passer domesticus");
    TokenStream tokens = getTokens(input);
    SciNameIterator iter = new SciNameIterator(tokens);
    for (SciName sn : iter) {
        // System.out.println(sn);
        assertEquals(expected.poll(), sn.scientificName);
    }
    tokens.end();
    tokens.close();
}