Example usage for org.apache.lucene.analysis TokenStream end

List of usage examples for org.apache.lucene.analysis TokenStream end

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream end.

Prototype

public void end() throws IOException 

Source Link

Document

This method is called by the consumer after the last token has been consumed, after #incrementToken() returned false (using the new TokenStream API).

Usage

From source file:org.exist.indexing.lucene.XMLToQuery.java

License:Open Source License

/**
 * Builds a phrase query for the given field from the XML query element.
 * <p>
 * If the element has no {@code <term>} children, its text content is run
 * through the analyzer and each token becomes a position in a
 * {@link PhraseQuery}. Otherwise each {@code <term>} child contributes one
 * position of a {@link MultiPhraseQuery}, with wildcard terms expanded.
 *
 * @param field    the index field the phrase matches against
 * @param node     the XML element describing the phrase
 * @param analyzer analyzer used to tokenize text content
 * @return a {@code PhraseQuery} or {@code MultiPhraseQuery}
 * @throws XPathException if tokenizing the phrase text fails
 */
private Query phraseQuery(String field, Element node, Analyzer analyzer) throws XPathException {
    NodeList termList = node.getElementsByTagName("term");
    if (termList.getLength() == 0) {
        PhraseQuery query = new PhraseQuery();
        String qstr = getText(node);
        // try-with-resources: the stream is closed even if tokenization throws
        try (TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr))) {
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                query.add(new Term(field, termAttr.toString()));
            }
            stream.end();
        } catch (IOException e) {
            // keep the cause so the underlying analyzer failure is not lost
            throw new XPathException("Error while parsing phrase query: " + qstr, e);
        }
        int slop = getSlop(node);
        if (slop > -1)
            query.setSlop(slop);
        return query;
    }
    MultiPhraseQuery query = new MultiPhraseQuery();
    for (int i = 0; i < termList.getLength(); i++) {
        Element elem = (Element) termList.item(i);
        String text = getText(elem);
        // NOTE(review): '?' counts as a wildcard anywhere, but '*' only past
        // position 0 — presumably to reject leading-asterisk expansion; confirm.
        if (text.indexOf('?') > -1 || text.indexOf('*') > 0) {
            Term[] expanded = expandTerms(field, text);
            if (expanded.length > 0)
                query.add(expanded);
        } else {
            String termStr = getTerm(field, text, analyzer);
            if (termStr != null)
                // bug fix: add the analyzed term, not the raw (unanalyzed) text
                query.add(new Term(field, termStr));
        }
    }
    int slop = getSlop(node);
    if (slop > -1)
        query.setSlop(slop);
    return query;
}

From source file:org.exist.indexing.lucene.XMLToQuery.java

License:Open Source License

/**
 * Builds a span-near query from the XML query element.
 * <p>
 * With no element children, the node's text is analyzed and every token
 * becomes a {@link SpanTermQuery} clause; otherwise the child elements are
 * parsed recursively via {@code parseSpanChildren}. Slop defaults to 0 and
 * ordering to {@code true} unless overridden by the {@code ordered}
 * attribute ({@code "yes"}/other).
 *
 * @param field    the index field the spans match against
 * @param node     the XML element describing the near query
 * @param analyzer analyzer used to tokenize text content
 * @return the assembled {@link SpanNearQuery}
 * @throws XPathException if tokenizing the text content fails
 */
private SpanQuery nearQuery(String field, Element node, Analyzer analyzer) throws XPathException {
    int slop = getSlop(node);
    if (slop < 0)
        slop = 0;
    boolean inOrder = true;
    if (node.hasAttribute("ordered"))
        inOrder = node.getAttribute("ordered").equals("yes");

    if (!hasElementContent(node)) {
        String qstr = getText(node);
        List<SpanTermQuery> list = new ArrayList<>(8);
        // try-with-resources: the stream is closed even if tokenization throws
        try (TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr))) {
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                list.add(new SpanTermQuery(new Term(field, termAttr.toString())));
            }
            stream.end();
        } catch (IOException e) {
            // keep the cause so the underlying analyzer failure is not lost
            throw new XPathException("Error while parsing phrase query: " + qstr, e);
        }
        return new SpanNearQuery(list.toArray(new SpanTermQuery[list.size()]), slop, inOrder);
    }
    SpanQuery[] children = parseSpanChildren(field, node, analyzer);
    return new SpanNearQuery(children, slop, inOrder);
}

From source file:org.exist.indexing.lucene.XMLToQuery.java

License:Open Source License

/**
 * Runs {@code text} through the analyzer and returns the first token it
 * produces for the given field, or {@code null} if analysis yields no token.
 *
 * @param field    the index field (selects per-field analysis)
 * @param text     the raw text to analyze
 * @param analyzer analyzer used to tokenize the text
 * @return the first analyzed term, or {@code null} if none
 * @throws XPathException wrapping any {@link IOException} from the analyzer
 */
private String getTerm(String field, String text, Analyzer analyzer) throws XPathException {
    String term = null;
    // try-with-resources: the stream is closed even if incrementToken throws
    try (TokenStream stream = analyzer.tokenStream(field, new StringReader(text))) {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        if (stream.incrementToken()) {
            term = termAttr.toString();
        }
        stream.end();
        return term;
    } catch (IOException e) {
        throw new XPathException("Lucene index error while creating query: " + e.getMessage(), e);
    }
}

From source file:org.exist.indexing.range.RangeIndexWorker.java

License:Open Source License

/**
 * Converts raw content into the indexed byte form for a range-index lookup.
 * <p>
 * Content is lower-cased when the index is configured case-insensitive
 * (NOTE(review): {@code toLowerCase()} uses the default locale — confirm
 * that is intended for non-ASCII content). If no analyzer is configured the
 * content is used verbatim; otherwise only the FIRST token the analyzer
 * emits is returned, or {@code null} when analysis yields no token.
 *
 * @param field the index field name
 * @param qname the qualified element/attribute name the index is defined on
 * @param data  the raw content to analyze
 * @param docs  document set used to resolve the index configuration
 * @return the term bytes to look up, or {@code null} if analysis produced no token
 * @throws XPathException wrapping any {@link IOException} from the analyzer
 */
protected BytesRef analyzeContent(String field, QName qname, String data, DocumentSet docs)
        throws XPathException {
    final Analyzer analyzer = getAnalyzer(qname, field, docs);
    if (!isCaseSensitive(qname, field, docs)) {
        data = data.toLowerCase();
    }
    if (analyzer == null) {
        // no analysis configured: index stores the raw (possibly lower-cased) value
        return new BytesRef(data);
    }
    // try-with-resources replaces the explicit finally { stream.close(); }
    try (TokenStream stream = analyzer.tokenStream(field, new StringReader(data))) {
        TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
        BytesRef token = null;
        stream.reset();
        if (stream.incrementToken()) {
            termAttr.fillBytesRef();
            token = termAttr.getBytesRef();
        }
        stream.end();
        return token;
    } catch (IOException e) {
        throw new XPathException("Error analyzing the query string: " + e.getMessage(), e);
    }
}

From source file:org.exoplatform.services.jcr.impl.core.query.lucene.AbstractExcerpt.java

License:Apache License

/**
 * @param text the text.//from w  w  w .  java  2 s.  co m
 * @return a <code>TermPositionVector</code> for the given text.
 */
/**
 * Tokenizes {@code text} with the index's text analyzer and exposes the
 * resulting term/offset information as a {@link TermPositionVector}.
 * <p>
 * Terms are collected into a sorted map (term → offsets of every
 * occurrence); the returned anonymous implementation serves lookups from
 * that map. Positions are not tracked — {@code getTermPositions} always
 * returns {@code null}.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]; TreeMap keeps terms sorted so the
    // anonymous class below can use Arrays.binarySearch in indexOf().
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    // NOTE(review): ts.reset() is never called before incrementToken(), and
    // the stream is not closed if an exception escapes — confirm against the
    // Lucene version in use (newer TokenStream contracts require reset()).
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            String termText = new String(term.buffer(), 0, term.length());
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                // first occurrence of this term
                info = new TermVectorOffsetInfo[1];
            } else {
                // grow the per-term offset array by one slot
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
        if (LOG.isTraceEnabled()) {
            LOG.trace("An exception occurred: " + e.getMessage());
        }
    }

    return new TermPositionVector() {

        // snapshot of the (sorted) term set; index positions in this array
        // are the "term indexes" used by the interface methods below
        private String[] terms = termMap.keySet().toArray(new String[termMap.size()]);

        // positions were never collected above, so none can be reported
        public int[] getTermPositions(int index) {
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            // matches the empty field name passed to tokenStream("") above
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            // frequency of a term == number of collected offsets for it
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            // valid because `terms` comes from a sorted map
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            // NOTE(review): `start` is ignored — lookup always begins at
            // terms[0]; confirm callers never pass start > 0.
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}

From source file:org.exoplatform.services.jcr.impl.core.query.lucene.MoreLikeThis.java

License:Apache License

/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 *///from   w  ww.  j a  va2  s  .c o  m
/**
 * Adds term frequencies found by tokenizing text from the reader into
 * {@code termFreqMap}.
 * <p>
 * Tokenization stops after {@code maxNumTokensParsed} tokens; noise words
 * are skipped. NOTE(review): on first sight a word is stored with a fresh
 * {@code Int} (whose initial value is defined elsewhere) and only
 * subsequent sightings increment {@code cnt.x} — behavior preserved as-is.
 *
 * @param r           a source of text to be tokenized
 * @param termFreqMap a map of terms and their frequencies
 * @param fieldName   used by the analyzer for any special per-field analysis
 * @throws IOException if tokenization fails
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        // getAttribute returns the same instance on every call,
        // so fetch it once instead of on every loop iteration
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        int tokenCount = 0;
        // for every token
        while (ts.incrementToken()) {
            String word = new String(term.buffer(), 0, term.length());
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    } finally {
        // previously the stream leaked if incrementToken() threw
        ts.close();
    }
}

From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java

License:Apache License

/**
 * Verifies that abbreviated genus names ("A. alba") are expanded to the
 * most recently seen matching genus ("Abies alba"), while a bare "A." with
 * no epithet is not treated as a name.
 */
@Test
public void testAbbreviatedNames() throws Exception {
    String text = "A survey of the Abies conifers found in Europe. A. is not a abbreviated genus. A. alba, A. betula, A.picea and Picea picea is something else.";
    Reader input = new StringReader(text);
    LinkedList<String> expected = new LinkedList<String>();
    expected.add("Abies");
    expected.add("Abies alba");
    expected.add("Abies betula");
    expected.add("Abies picea");
    expected.add("Picea picea");

    TokenStream tokens = getTokens(input);
    SciNameIterator iter = new SciNameIterator(tokens);
    for (SciName sn : iter) {
        assertEquals(expected.poll(), sn.scientificName);
    }
    // bug fix: previously a stream yielding too FEW names still passed —
    // assert every expected name was actually produced
    assertTrue("missing expected names: " + expected, expected.isEmpty());
    tokens.end();
    tokens.close();
}

From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java

License:Apache License

/**
 * Test Biebersteiniaceae eFlora example html that proved to have problems with names found across html tags
 * Source: http://www.efloras.org/florataxon.aspx?flora_id=2&taxon_id=20048
 *///from   w w w .  j  av  a  2s .  c o  m
@Test
public void testBiebersteiniaceae() throws Exception {
    Reader input = new InputStreamReader(isu.classpathStream("sources/biebersteiniaceae/document.txt"),
            "UTF-8");
    TokenStream tokens = getTokens(input);
    SciNameIterator iter = new SciNameIterator(tokens);
    int count = 0;
    for (SciName sn : iter) {
        System.out.println(sn);
        count++;
    }
    System.out.println("Biebersteiniaceae names found: " + count);
    assertTrue(count == 14);
    tokens.end();
    tokens.close();
}

From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java

License:Apache License

/**
 * Test bioline html file taken from http://www.bioline.org.br/abstract?id=fb95003
 */// w w w. j a  v  a 2 s . c  o  m
@Test
public void testHtml() throws Exception {
    Reader input = new InputStreamReader(isu.classpathStream("sources/bioline/document.txt"), "UTF-8");
    // input = new InputStreamReader(new FileInputStream(new File("/Users/markus/Desktop/bioline-fb95003.html")), "UTF-8");
    TokenStream tokens = getTokens(input);
    SciNameIterator iter = new SciNameIterator(tokens);
    int count = 0;
    int countMugil = 0;
    for (SciName sn : iter) {
        System.out.println(sn);
        count++;
        if (sn.scientificName.startsWith("Mugil ")) {
            countMugil++;
        }
    }
    System.out.println("BIOLINE names found: " + count);
    assertTrue(count == 49);
    assertTrue(countMugil == 12);
    tokens.end();
    tokens.close();
}

From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java

License:Apache License

/**
 * Verifies name extraction from free-running prose: families, names with
 * subgenus parentheses, binomials with author strings, and subspecies —
 * while ambiguous fragments ("(Felinia)" alone, "Paris") are skipped.
 */
@Test
public void testSimpleText() throws Exception {
    String text = "Help, Asteraceae or is (Felinia) or Felis (Felinia) foordi found. I can't see any of these famous Abies alba anywhere around here, can you? Maybe this is Potentilla vulgaris L. ? You can't be sure, my dear. Paris is a pretty town too, isn't it? They have big numbers of Passer domesticus subsp. domesticus, the most frequent subspecies of Passer domesticus (Linnaeus, 1758)";
    Reader input = new StringReader(text);
    LinkedList<String> expected = new LinkedList<String>();
    expected.add("Asteraceae");
    expected.add("Felis (Felinia) foordi");
    expected.add("Abies alba");
    expected.add("Potentilla vulgaris");
    expected.add("Passer domesticus subsp. domesticus");
    expected.add("Passer domesticus");
    TokenStream tokens = getTokens(input);
    SciNameIterator iter = new SciNameIterator(tokens);
    for (SciName sn : iter) {
        assertEquals(expected.poll(), sn.scientificName);
    }
    // bug fix: previously a stream yielding too FEW names still passed —
    // assert every expected name was actually produced (also removed the
    // duplicated leftover debug printlns of StringUtils.isAllUpperCase)
    assertTrue("missing expected names: " + expected, expected.isEmpty());
    tokens.end();
    tokens.close();
}