Usage examples for org.apache.lucene.analysis.TokenStream#close(), collected from open-source projects
@Override public void close() throws IOException
From source file:org.exist.indexing.range.RangeIndexWorker.java
License:Open Source License
protected BytesRef analyzeContent(String field, QName qname, String data, DocumentSet docs) throws XPathException { final Analyzer analyzer = getAnalyzer(qname, field, docs); if (!isCaseSensitive(qname, field, docs)) { data = data.toLowerCase();//from w w w . j a va2 s . c o m } if (analyzer == null) { return new BytesRef(data); } try { TokenStream stream = analyzer.tokenStream(field, new StringReader(data)); TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class); BytesRef token = null; try { stream.reset(); if (stream.incrementToken()) { termAttr.fillBytesRef(); token = termAttr.getBytesRef(); } stream.end(); } finally { stream.close(); } return token; } catch (IOException e) { throw new XPathException("Error analyzing the query string: " + e.getMessage(), e); } }
From source file:org.exoplatform.services.jcr.impl.core.query.lucene.AbstractExcerpt.java
License:Apache License
/** * @param text the text./* w w w.j a va2 s. co m*/ * @return a <code>TermPositionVector</code> for the given text. */ private TermPositionVector createTermPositionVector(String text) { // term -> TermVectorOffsetInfo[] final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>(); Reader r = new StringReader(text); TokenStream ts = index.getTextAnalyzer().tokenStream("", r); try { while (ts.incrementToken()) { OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); String termText = new String(term.buffer(), 0, term.length()); TermVectorOffsetInfo[] info = termMap.get(termText); if (info == null) { info = new TermVectorOffsetInfo[1]; } else { TermVectorOffsetInfo[] tmp = info; info = new TermVectorOffsetInfo[tmp.length + 1]; System.arraycopy(tmp, 0, info, 0, tmp.length); } info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset()); termMap.put(termText, info); } ts.end(); ts.close(); } catch (IOException e) { // should never happen, we are reading from a string if (LOG.isTraceEnabled()) { LOG.trace("An exception occurred: " + e.getMessage()); } } return new TermPositionVector() { private String[] terms = termMap.keySet().toArray(new String[termMap.size()]); public int[] getTermPositions(int index) { return null; } public TermVectorOffsetInfo[] getOffsets(int index) { TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO; if (index >= 0 && index < terms.length) { info = termMap.get(terms[index]); } return info; } public String getField() { return ""; } public int size() { return terms.length; } public String[] getTerms() { return terms; } public int[] getTermFrequencies() { int[] freqs = new int[terms.length]; for (int i = 0; i < terms.length; i++) { freqs[i] = termMap.get(terms[i]).length; } return freqs; } public int indexOf(String term) { int res = Arrays.binarySearch(terms, term); return res >= 0 
? res : -1; } public int[] indexesOf(String[] terms, int start, int len) { int[] res = new int[len]; for (int i = 0; i < len; i++) { res[i] = indexOf(terms[i]); } return res; } }; }
From source file:org.exoplatform.services.jcr.impl.core.query.lucene.MoreLikeThis.java
License:Apache License
/** * Adds term frequencies found by tokenizing text from reader into the Map words * @param r a source of text to be tokenized * @param termFreqMap a Map of terms and their frequencies * @param fieldName Used by analyzer for any special per-field analysis *//*w w w . ja v a2 s. c o m*/ private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException { TokenStream ts = analyzer.tokenStream(fieldName, r); int tokenCount = 0; // for every token while (ts.incrementToken()) { CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); String word = new String(term.buffer(), 0, term.length()); tokenCount++; if (tokenCount > maxNumTokensParsed) { break; } if (isNoiseWord(word)) { continue; } // increment frequency Int cnt = termFreqMap.get(word); if (cnt == null) { termFreqMap.put(word, new Int()); } else { cnt.x++; } } ts.end(); ts.close(); }
From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java
License:Apache License
@Test public void testAbbreviatedNames() throws Exception { String text = "A survey of the Abies conifers found in Europe. A. is not a abbreviated genus. A. alba, A. betula, A.picea and Picea picea is something else."; Reader input = new StringReader(text); LinkedList<String> expected = new LinkedList<String>(); expected.add("Abies"); expected.add("Abies alba"); expected.add("Abies betula"); expected.add("Abies picea"); expected.add("Picea picea"); TokenStream tokens = getTokens(input); SciNameIterator iter = new SciNameIterator(tokens); for (SciName sn : iter) { // System.out.println(sn); assertEquals(expected.poll(), sn.scientificName); }// w w w. ja va 2 s.c o m tokens.end(); tokens.close(); }
From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java
License:Apache License
/**
 * Test Biebersteiniaceae eFlora example html that proved to have problems with names found across html tags
 * Source: http://www.efloras.org/florataxon.aspx?flora_id=2&taxon_id=20048
 */
@Test
public void testBiebersteiniaceae() throws Exception {
    Reader input = new InputStreamReader(isu.classpathStream("sources/biebersteiniaceae/document.txt"), "UTF-8");
    TokenStream tokens = getTokens(input);
    SciNameIterator iter = new SciNameIterator(tokens);
    int count = 0;
    for (SciName sn : iter) {
        System.out.println(sn);
        count++;
    }
    System.out.println("Biebersteiniaceae names found: " + count);
    // FIX: assertEquals reports expected vs. actual on failure,
    // unlike the previous assertTrue(count == 14).
    assertEquals(14, count);
    tokens.end();
    tokens.close();
}
From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java
License:Apache License
/** * Test bioline html file taken from http://www.bioline.org.br/abstract?id=fb95003 *///from w w w .j ava 2s . c o m @Test public void testHtml() throws Exception { Reader input = new InputStreamReader(isu.classpathStream("sources/bioline/document.txt"), "UTF-8"); // input = new InputStreamReader(new FileInputStream(new File("/Users/markus/Desktop/bioline-fb95003.html")), "UTF-8"); TokenStream tokens = getTokens(input); SciNameIterator iter = new SciNameIterator(tokens); int count = 0; int countMugil = 0; for (SciName sn : iter) { System.out.println(sn); count++; if (sn.scientificName.startsWith("Mugil ")) { countMugil++; } } System.out.println("BIOLINE names found: " + count); assertTrue(count == 49); assertTrue(countMugil == 12); tokens.end(); tokens.close(); }
From source file:org.gbif.namefinder.analysis.sciname.SciNameAnalyzerTest.java
License:Apache License
@Test public void testSimpleText() throws Exception { System.out.println(StringUtils.isAllUpperCase("G")); System.out.println(StringUtils.isAllUpperCase("G")); String text = "Help, Asteraceae or is (Felinia) or Felis (Felinia) foordi found. I can't see any of these famous Abies alba anywhere around here, can you? Maybe this is Potentilla vulgaris L. ? You can't be sure, my dear. Paris is a pretty town too, isn't it? They have big numbers of Passer domesticus subsp. domesticus, the most frequent subspecies of Passer domesticus (Linnaeus, 1758)"; Reader input = new StringReader(text); LinkedList<String> expected = new LinkedList<String>(); expected.add("Asteraceae"); expected.add("Felis (Felinia) foordi"); expected.add("Abies alba"); expected.add("Potentilla vulgaris"); expected.add("Passer domesticus subsp. domesticus"); expected.add("Passer domesticus"); TokenStream tokens = getTokens(input); SciNameIterator iter = new SciNameIterator(tokens); for (SciName sn : iter) { // System.out.println(sn); assertEquals(expected.poll(), sn.scientificName); }/*from w w w .jav a 2 s.co m*/ tokens.end(); tokens.close(); }
From source file:org.genemania.completion.lucene.GeneCompletionProvider.java
License:Open Source License
public Long getNodeId(String symbol) { try {// ww w .ja v a 2 s.co m TokenStream tokens = analyze(symbol); PhraseQuery query = new PhraseQuery(); tokens.reset(); while (tokens.incrementToken()) { TermAttribute term = tokens.getAttribute(TermAttribute.class); query.add(new Term(GeneIndexBuilder.GENE_FIELD, term.term())); } tokens.end(); tokens.close(); final Set<Long> nodes = new HashSet<Long>(); searcher.search(query, new AbstractCollector() { @Override public void handleHit(int id) { try { Document document = searcher.doc(id); nodes.add(Long.parseLong(document.get(GeneIndexBuilder.NODE_ID_FIELD))); } catch (IOException e) { log(e); } } }); if (nodes.size() > 0) { return nodes.iterator().next(); } } catch (IOException e) { log(e); } return null; }
From source file:org.genemania.data.classification.lucene.LuceneGeneClassifier.java
License:Open Source License
public void classify(final String symbol, final IGeneClassificationHandler handler) throws ApplicationException { try {//from w w w . j ava 2 s . c o m TokenStream tokens = analyze(symbol); PhraseQuery query = new PhraseQuery(); tokens.reset(); while (tokens.incrementToken()) { TermAttribute term = tokens.getAttribute(TermAttribute.class); query.add(new Term(LuceneMediator.GENE_SYMBOL, term.term())); } tokens.end(); tokens.close(); searcher.search(query, new AbstractCollector() { @Override public void handleHit(int doc) { try { Document document = searcher.doc(doc); long organismId = Long.parseLong(document.get(LuceneMediator.GENE_ORGANISM_ID)); handler.handleClassification(symbol, organismId); } catch (IOException e) { log(e); } } }); } catch (IOException e) { throw new ApplicationException(e); } }
From source file:org.genemania.mediator.lucene.LuceneMediator.java
License:Open Source License
protected PhraseQuery createPhraseQuery(String field, String phrase) throws IOException { TokenStream stream = analyze(phrase); stream.reset();//from ww w . j ava2 s . c o m PhraseQuery query = new PhraseQuery(); while (stream.incrementToken()) { TermAttribute term = stream.getAttribute(TermAttribute.class); query.add(new Term(field, term.term())); } stream.end(); stream.close(); return query; }